# charset is ASCII, as the standard (RFC XXX) requires. try: if isinstance(input_charset, str): input_charset.encode('ascii') else: input_charset = str(input_charset, 'ascii') except UnicodeError: raise errors.CharsetError(input_charset) input_charset = input_charset.lower() # Set the input charset after filtering through the aliases self.input_charset = ALIASES.get(input_charset, input_charset) # We can try to guess which encoding and conversion to use by the # charset_map dictionary. Try that first, but let the user override # it. henc, benc, conv = CHARSETS.get(self.input_charset, (SHORTEST, BASE64, None)) if not conv: conv = self.input_charset # Set the attributes, allowing the arguments to override the default. self.header_encoding = henc self.body_encoding = benc self.output_charset = ALIASES.get(conv, conv) # Now set the codecs. If one isn't defined for input_charset, # guess and try a Unicode codec with the same name as input_codec. self.input_codec = CODEC_MAP.get(self.input_charset, self.input_charset) self.output_codec = CODEC_MAP.get(self.output_charset, self.output_charset)
# NOTE(review): the line above is the tail of a definition that starts
# outside this view; it is kept exactly as found.  Every definition below
# takes `self`, so they presumably belong to a class whose header is also
# outside this view -- re-indent them under that class when merging.


def __repr__(self):
    """Return the input charset name, lower-cased."""
    return self.input_charset.lower()


def __eq__(self, other):
    """Compare case-insensitively against the string form of `other`."""
    return str(self) == str(other).lower()


def get_body_encoding(self):
    """Return the content-transfer-encoding used for body encoding.

    This is either the string `quoted-printable' or `base64' depending on
    the encoding used, or it is a function in which case you should call
    the function with a single argument, the Message object being encoded.
    The function should then set the Content-Transfer-Encoding header
    itself to whatever is appropriate.

    Returns "quoted-printable" if self.body_encoding is QP.
    Returns "base64" if self.body_encoding is BASE64.
    Returns conversion function otherwise.
    """
    # SHORTEST is not accepted as a body encoding here.
    assert self.body_encoding != SHORTEST
    if self.body_encoding == QP:
        return 'quoted-printable'
    elif self.body_encoding == BASE64:
        return 'base64'
    else:
        return encode_7or8bit


def get_output_charset(self):
    """Return the output character set.

    This is self.output_charset if that is not None, otherwise it is
    self.input_charset.
    """
    return self.output_charset or self.input_charset


def header_encode(self, string):
    """Header-encode a string by converting it first to bytes.

    The type of encoding (base64 or quoted-printable) will be based on
    this charset's `header_encoding`.

    :param string: A unicode string for the header.  It must be possible
        to encode this string to bytes using the character set's
        output codec.
    :return: The encoded string, with RFC 2047 chrome.  When no header
        encoder applies, `string` is returned unchanged.
    """
    codec = self.output_codec or 'us-ascii'
    header_bytes = _encode(string, codec)
    # 7bit/8bit encodings return the string unchanged (modulo conversions)
    encoder_module = self._get_encoder(header_bytes)
    if encoder_module is None:
        return string
    return encoder_module.header_encode(header_bytes, codec)


def header_encode_lines(self, string, maxlengths):
    """Header-encode a string by converting it first to bytes.

    This is similar to `header_encode()` except that the string is fit
    into maximum line lengths as given by the argument.

    :param string: A unicode string for the header.  It must be possible
        to encode this string to bytes using the character set's
        output codec.
    :param maxlengths: Maximum line length iterator.  Each element
        returned from this iterator will provide the next maximum line
        length.  This parameter is used as an argument to built-in next()
        and should never be exhausted.  The maximum line lengths should
        not count the RFC 2047 chrome.  These line lengths are only a
        hint; the splitter does the best it can.
    :return: Lines of encoded strings, each with RFC 2047 chrome.  The
        first element is None when not even one character fits on the
        first line.  When no header encoder applies, the result is the
        single-element list ``[string]``, mirroring `header_encode()`.
    """
    # See which encoding we should use.
    codec = self.output_codec or 'us-ascii'
    header_bytes = _encode(string, codec)
    encoder_module = self._get_encoder(header_bytes)
    if encoder_module is None:
        # No header encoder for this charset: nothing to split or wrap.
        return [string]
    encoder = partial(encoder_module.header_encode, charset=codec)
    # Calculate the number of characters that the RFC 2047 chrome will
    # contribute to each line.
    # NOTE(review): the budget uses the output charset name while `encoder`
    # is given `codec`; the two names can differ in length.
    charset = self.get_output_charset()
    extra = len(charset) + RFC2047_CHROME_LEN
    # Now comes the hard part.  We must encode bytes but we can't split on
    # bytes because some character sets are variable length and each
    # encoded word must stand on its own.  So the problem is you have to
    # encode to bytes to figure out this word's length, but you must split
    # on characters.  This causes two problems: first, we don't know how
    # many octets a specific substring of unicode characters will get
    # encoded to, and second, we don't know how many ASCII characters
    # those octets will get encoded to.  Unless we try it.  Which seems
    # inefficient.  In the interest of being correct rather than fast (and
    # in the hope that there will be few encoded headers in any such
    # message), brute force it. :(
    lines = []
    current_line = []
    maxlen = next(maxlengths) - extra
    for character in string:
        current_line.append(character)
        this_line = EMPTYSTRING.join(current_line)
        # Measure with the same codec that produces the emitted bytes, so
        # a charset name that is not itself a Python codec still works.
        length = encoder_module.header_length(_encode(this_line, codec))
        if length > maxlen:
            # This last character doesn't fit so pop it off.
            current_line.pop()
            # Does nothing fit on the first line?
            if not lines and not current_line:
                lines.append(None)
            else:
                joined_line = EMPTYSTRING.join(current_line)
                header_bytes = _encode(joined_line, codec)
                lines.append(encoder(header_bytes))
            # Start the next line with the character that did not fit.
            current_line = [character]
            maxlen = next(maxlengths) - extra
    joined_line = EMPTYSTRING.join(current_line)
    header_bytes = _encode(joined_line, codec)
    lines.append(encoder(header_bytes))
    return lines


def _get_encoder(self, header_bytes):
    """Return the module used to header-encode `header_bytes`, or None.

    BASE64 and QP select their module directly.  SHORTEST picks whichever
    module reports the smaller header length for these bytes, preferring
    quoted-printable on a tie.  Any other `header_encoding` yields None.
    """
    if self.header_encoding == BASE64:
        return email.base64mime
    elif self.header_encoding == QP:
        return email.quoprimime
    elif self.header_encoding == SHORTEST:
        len64 = email.base64mime.header_length(header_bytes)
        lenqp = email.quoprimime.header_length(header_bytes)
        if len64 < lenqp:
            return email.base64mime
        else:
            return email.quoprimime
    else:
        return None


def body_encode(self, string):
    """Body-encode a string by converting it first to bytes.

    The type of encoding (base64 or quoted-printable) will be based on
    self.body_encoding.  If body_encoding is None, we assume the
    output charset is a 7bit encoding, so re-encoding the decoded
    string using the ascii codec produces the correct string version
    of the content.

    :param string: The body text (str), or already-encoded bytes.
    :return: The encoded body; a falsy `string` is returned unchanged.
    """
    if not string:
        return string
    # Compare by equality, as get_body_encoding() does, rather than by
    # object identity.
    if self.body_encoding == BASE64:
        if isinstance(string, str):
            string = string.encode(self.output_charset)
        return email.base64mime.body_encode(string)
    elif self.body_encoding == QP:
        # quoprimime.body_encode takes a string, but operates on it as if
        # it were a list of byte codes.  For a (minimal) history on why
        # this is so, see changeset 0cf700464177.  To correctly encode a
        # character set, then, we must turn it into pseudo bytes via the
        # latin1 charset, which will encode any byte as a single code point
        # between 0 and 255, which is what body_encode is expecting.
        if isinstance(string, str):
            string = string.encode(self.output_charset)
        string = string.decode('latin1')
        return email.quoprimime.body_encode(string)
    else:
        if isinstance(string, str):
            string = string.encode(self.output_charset).decode('ascii')
        return string