Support#
Here is a list of the supported encodings and supported languages as of the latest update. This list may change depending on your Python version.
Supported Encodings#
Charset Normalizer is able to detect any of these encodings. This list is NOT static and depends heavily on what your current CPython version ships with. See https://docs.python.org/3/library/codecs.html#standard-encodings
IANA Code Page |
Aliases |
---|---|
ascii |
646, ansi_x3.4_1968, ansi_x3_4_1968, ansi_x3.4_1986, cp367, csascii, ibm367, iso646_us, iso_646.irv_1991, iso_ir_6, us, us_ascii |
big5 |
big5_tw, csbig5, x_mac_trad_chinese |
big5hkscs |
big5_hkscs, hkscs |
cp037 |
037, csibm037, ebcdic_cp_ca, ebcdic_cp_nl, ebcdic_cp_us, ebcdic_cp_wt, ibm037, ibm039 |
cp1026 |
1026, csibm1026, ibm1026 |
cp1125 |
1125, ibm1125, cp866u, ruscii |
cp1140 |
1140, ibm1140 |
cp1250 |
1250, windows_1250 |
cp1251 |
1251, windows_1251 |
cp1252 |
1252, windows_1252 |
cp1253 |
1253, windows_1253 |
cp1254 |
1254, windows_1254 |
cp1255 |
1255, windows_1255 |
cp1256 |
1256, windows_1256 |
cp1257 |
1257, windows_1257 |
cp1258 |
1258, windows_1258 |
cp273 |
273, ibm273, csibm273 |
cp424 |
424, csibm424, ebcdic_cp_he, ibm424 |
cp437 |
437, cspc8codepage437, ibm437 |
cp500 |
500, csibm500, ebcdic_cp_be, ebcdic_cp_ch, ibm500 |
cp775 |
775, cspc775baltic, ibm775 |
cp850 |
850, cspc850multilingual, ibm850 |
cp852 |
852, cspcp852, ibm852 |
cp855 |
855, csibm855, ibm855 |
cp857 |
857, csibm857, ibm857 |
cp858 |
858, csibm858, ibm858 |
cp860 |
860, csibm860, ibm860 |
cp861 |
861, cp_is, csibm861, ibm861 |
cp862 |
862, cspc862latinhebrew, ibm862 |
cp863 |
863, csibm863, ibm863 |
cp864 |
864, csibm864, ibm864 |
cp865 |
865, csibm865, ibm865 |
cp866 |
866, csibm866, ibm866 |
cp869 |
869, cp_gr, csibm869, ibm869 |
cp932 |
932, ms932, mskanji, ms_kanji |
cp949 |
949, ms949, uhc |
cp950 |
950, ms950 |
euc_jis_2004 |
jisx0213, eucjis2004, euc_jis2004 |
euc_jisx0213 |
eucjisx0213 |
euc_jp |
eucjp, ujis, u_jis |
euc_kr |
euckr, korean, ksc5601, ks_c_5601, ks_c_5601_1987, ksx1001, ks_x_1001, x_mac_korean |
gb18030 |
gb18030_2000 |
gb2312 |
chinese, csiso58gb231280, euc_cn, euccn, eucgb2312_cn, gb2312_1980, gb2312_80, iso_ir_58, x_mac_simp_chinese |
gbk |
936, cp936, ms936 |
hp_roman8 |
roman8, r8, csHPRoman8 |
hz |
hzgb, hz_gb, hz_gb_2312 |
iso2022_jp |
csiso2022jp, iso2022jp, iso_2022_jp |
iso2022_jp_1 |
iso2022jp_1, iso_2022_jp_1 |
iso2022_jp_2 |
iso2022jp_2, iso_2022_jp_2 |
iso2022_jp_3 |
iso2022jp_3, iso_2022_jp_3 |
iso2022_jp_ext |
iso2022jp_ext, iso_2022_jp_ext |
iso2022_kr |
csiso2022kr, iso2022kr, iso_2022_kr |
iso8859_10 |
csisolatin6, iso_8859_10, iso_8859_10_1992, iso_ir_157, l6, latin6 |
iso8859_11 |
thai, iso_8859_11, iso_8859_11_2001 |
iso8859_13 |
iso_8859_13, l7, latin7 |
iso8859_14 |
iso_8859_14, iso_8859_14_1998, iso_celtic, iso_ir_199, l8, latin8 |
iso8859_15 |
iso_8859_15, l9, latin9 |
iso8859_16 |
iso_8859_16, iso_8859_16_2001, iso_ir_226, l10, latin10 |
iso8859_2 |
csisolatin2, iso_8859_2, iso_8859_2_1987, iso_ir_101, l2, latin2 |
iso8859_3 |
csisolatin3, iso_8859_3, iso_8859_3_1988, iso_ir_109, l3, latin3 |
iso8859_4 |
csisolatin4, iso_8859_4, iso_8859_4_1988, iso_ir_110, l4, latin4 |
iso8859_5 |
csisolatincyrillic, cyrillic, iso_8859_5, iso_8859_5_1988, iso_ir_144 |
iso8859_6 |
arabic, asmo_708, csisolatinarabic, ecma_114, iso_8859_6, iso_8859_6_1987, iso_ir_127 |
iso8859_7 |
csisolatingreek, ecma_118, elot_928, greek, greek8, iso_8859_7, iso_8859_7_1987, iso_ir_126 |
iso8859_8 |
csisolatinhebrew, hebrew, iso_8859_8, iso_8859_8_1988, iso_ir_138 |
iso8859_9 |
csisolatin5, iso_8859_9, iso_8859_9_1989, iso_ir_148, l5, latin5 |
iso2022_jp_2004 |
iso_2022_jp_2004, iso2022jp_2004 |
johab |
cp1361, ms1361 |
koi8_r |
cskoi8r |
kz1048 |
kz_1048, rk1048, strk1048_2002 |
latin_1 |
8859, cp819, csisolatin1, ibm819, iso8859, iso8859_1, iso_8859_1, iso_8859_1_1987, iso_ir_100, l1, latin, latin1 |
mac_cyrillic |
maccyrillic |
mac_greek |
macgreek |
mac_iceland |
maciceland |
mac_latin2 |
maccentraleurope, maclatin2 |
mac_roman |
macintosh, macroman |
mac_turkish |
macturkish |
mbcs |
ansi, dbcs |
ptcp154 |
csptcp154, pt154, cp154, cyrillic_asian |
rot_13 |
rot13 |
shift_jis |
csshiftjis, shiftjis, sjis, s_jis, x_mac_japanese |
shift_jis_2004 |
shiftjis2004, sjis_2004, s_jis_2004 |
shift_jisx0213 |
shiftjisx0213, sjisx0213, s_jisx0213 |
tactis |
tis260 |
tis_620 |
tis620, tis_620_0, tis_620_2529_0, tis_620_2529_1, iso_ir_166 |
utf_16 |
u16, utf16 |
utf_16_be |
unicodebigunmarked, utf_16be |
utf_16_le |
unicodelittleunmarked, utf_16le |
utf_32 |
u32, utf32 |
utf_32_be |
utf_32be |
utf_32_le |
utf_32le |
utf_8 |
u8, utf, utf8, utf8_ucs2, utf8_ucs4 (+utf_8_sig) |
utf_7 |
u7, unicode-1-1-utf-7 |
Supported Languages#
These languages can be detected inside your content. All of them are specified in ./charset_normalizer/assets/__init__.py.
English, German, French, Dutch, Italian, Polish, Spanish, Russian, Japanese, Portuguese, Swedish, Chinese, Ukrainian, Norwegian, Finnish, Vietnamese, Czech, Hungarian, Korean, Indonesian, Turkish, Romanian, Farsi, Arabic, Danish, Serbian, Lithuanian, Slovene, Slovak, Malay, Hebrew, Bulgarian, Croatian, Hindi, Estonian, Simple English, Thai, Greek, Tamil, Classical Chinese.