Requests 使用的是 urllib3,因此继承了它的所有特性。Requests 支持 HTTP 连接保持和连接池,支持使用 cookie 保持会话,支持文件上传,支持自动确定响应内容的编码,支持国际化的 URL 和 POST 数据自动编码。现代、国际化、人性化。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
def _parse_content_type(header):
    """Split a Content-Type header value into ``(media_type, params)``.

    Stands in for ``cgi.parse_header``, which was deprecated by PEP 594 and
    removed in Python 3.13.  Parameter names are lower-cased; parameters
    without an ``=`` are skipped.  Semicolons inside quoted values are not
    treated specially, which is sufficient for extracting ``charset``.
    """
    media_type, _, remainder = header.partition(';')
    params = {}
    for chunk in remainder.split(';'):
        name, sep, value = chunk.partition('=')
        if not sep:
            continue
        params[name.strip().lower()] = value.strip()
    return media_type.strip(), params


def get_encoding_from_headers(headers):
    """Returns the encoding declared by the given HTTP header dict.

    :param headers: dictionary to extract encoding from.
    :returns: the ``charset`` parameter of the ``content-type`` header with
        surrounding quotes removed; ``'ISO-8859-1'`` when there is no charset
        but the media type contains ``text``; otherwise ``None``.
    """
    content_type = headers.get('content-type')
    if not content_type:
        return None

    media_type, params = _parse_content_type(content_type)

    if 'charset' in params:
        return params['charset'].strip("'\"")

    # No explicit charset: fall back to the historical HTTP default for
    # text/* bodies.
    if 'text' in media_type:
        return 'ISO-8859-1'

    return None
1 2 3 4 5 6 7 8 9 10 11 12
# Compiled once at import time; the patterns themselves are unchanged.
_CHARSET_RE = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
_PRAGMA_RE = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
_XML_RE = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')


def get_encodings_from_content(content):
    """Returns encodings declared inside the given content.

    :param content: bytestring or text to extract encodings from.
    :returns: list of encoding names, in order: ``<meta charset>`` matches,
        ``<meta content="charset=...">`` matches, then the XML declaration.
        The list is empty when nothing is declared.
    """
    if isinstance(content, (bytes, bytearray)):
        # The patterns are str patterns, so raw bytes would raise TypeError
        # on Python 3.  latin-1 maps every byte to exactly one character, so
        # this decode cannot fail and leaves ASCII charset names intact.
        content = content.decode('latin-1')

    return (_CHARSET_RE.findall(content) +
            _PRAGMA_RE.findall(content) +
            _XML_RE.findall(content))
1 2 3 4 5
@property
def apparent_encoding(self):
    """Encoding guessed from the response body by the character detector."""
    # detect() returns a mapping; only its 'encoding' entry is exposed here.
    detection = chardet.detect(self.content)
    return detection['encoding']
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
>>> r = requests.get('')
>>> r.headers['content-type']
'text/html'
>>> r.encoding
'ISO-8859-1'
>>> r.apparent_encoding
'utf-8'
>>> requests.utils.get_encodings_from_content(r.content)
['utf-8']
>>> r = requests.get('')
>>> r.headers['content-type']
'text/html'
>>> r.encoding
'ISO-8859-1'
>>> r.apparent_encoding
'gb2312'
>>> requests.utils.get_encodings_from_content(r.content)
['gb2312']
了解原因之后,可以用下面这个 monkey patch 来解决这个问题:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
import codecs

import requests


def _known_codec(name):
    """Return *name* if Python has a codec registered for it, else ``None``."""
    if not name:
        return None
    try:
        codecs.lookup(name)
    except (LookupError, TypeError):
        return None
    return name


def monkey_patch():
    """Patch ``requests.models.Response.content`` to fix mislabelled charsets.

    When a response's encoding is the ``'ISO-8859-1'`` header default, the
    patched property looks for a charset declared in the body, falls back to
    ``apparent_encoding``, transcodes the body to UTF-8 and caches it in
    ``_content``.  ``encoding`` is then set to ``'utf-8'`` so that it
    describes the bytes ``content`` now returns.  If no usable charset is
    found the body is returned untouched.

    Calling this function more than once has no additional effect.
    """
    original = requests.models.Response.content
    if getattr(original.fget, '_charset_sniffing', False):
        return  # already patched; wrapping again would only repeat the work

    def content(self):
        raw = original.fget(self)
        if self.encoding != 'ISO-8859-1' or not raw:
            return raw

        # apparent_encoding reads self.content, i.e. this very property.
        # Clear the trigger value first so the nested access returns the raw
        # bytes instead of recursing forever.
        self.encoding = None

        # latin-1 maps every byte to one character, so this decode cannot
        # fail and gives the str-pattern regexes text to search.
        declared = requests.utils.get_encodings_from_content(
            raw.decode('latin-1'))
        source = next((name for name in declared if _known_codec(name)), None)
        if source is None:
            source = _known_codec(self.apparent_encoding)

        if source is None:
            # Nothing usable was detected: restore the header default and
            # hand back the body unchanged rather than crash in decode().
            self.encoding = 'ISO-8859-1'
            return raw

        converted = raw.decode(source, 'replace').encode('utf8', 'replace')
        self._content = converted
        # The cached bytes are UTF-8 now; keep ``encoding`` in step so that
        # decoding ``content`` with ``encoding`` yields the right text.
        self.encoding = 'utf-8'
        return converted

    content._charset_sniffing = True
    requests.models.Response.content = property(content)


monkey_patch()