import re

import chardet
33
def get_encoding(page):
    """Guess the character encoding of *page*, the raw bytes of an
    HTML/XML document.

    Encodings declared in the document itself (<meta charset=...>, the
    http-equiv Content-Type pragma, and the XML prolog) are tried first;
    the first one that successfully decodes the whole page wins.  If none
    work, the markup is stripped and chardet guesses from the remaining
    text.  Returns an encoding name (str); defaults to 'utf-8' when there
    is too little text to guess from.
    """
    # Bytes patterns: page is the raw, undecoded document, so str
    # patterns would raise TypeError on Python 3.
    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    declared_encodings = (charset_re.findall(page) +
                          pragma_re.findall(page) +
                          xml_re.findall(page))

    # Try any declared encodings.  Skip names Python does not know
    # (LookupError) as well as ones that fail to decode the page
    # (UnicodeDecodeError) instead of crashing on a bogus declaration.
    for declared in declared_encodings:
        encoding = custom_decode(declared.decode('ascii', 'replace'))
        try:
            page.decode(encoding)
            return encoding
        except (UnicodeDecodeError, LookupError):
            pass

    # Fallback to chardet: strip tags first so the guess is based on the
    # document's text, not its markup.
    text = re.sub(br'</?[^>]*>\s*', b' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc  # can't guess
    res = chardet.detect(text)
    # chardet reports encoding=None when it has no idea; keep the
    # utf-8 default rather than crashing in custom_decode.
    if res['encoding']:
        enc = custom_decode(res['encoding'])
    return enc
33+
def custom_decode(encoding):
    """Map *encoding* to a superset codec when the declared or detected
    charset is known to be a subset of a larger one (created because of
    issues with Chinese websites).

    The comparison is case-insensitive; the returned name is lower-case.
    Unknown encodings are returned unchanged (lower-cased).
    """
    encoding = encoding.lower()
    # Keys MUST be lower-case: the input was just lower-cased, so the
    # original mixed-case 'MacCyrillic' key could never match.
    alternates = {
        'big5': 'big5hkscs',
        'gb2312': 'gb18030',
        'ascii': 'utf-8',
        'maccyrillic': 'cp1251',
    }
    return alternates.get(encoding, encoding)
0 commit comments