from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
    # Only needed by static type checkers; thanks to the postponed
    # evaluation of annotations above, ResultDict is never looked up
    # at runtime.
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        """Shape of the mapping returned by the legacy ``detect()`` function."""

        # Name of the detected encoding, or None when no best match exists.
        encoding: Optional[str]
        # Detected language; empty string when it is reported as "Unknown"
        # or when no best match exists.
        language: str
        # 1.0 minus the best match's chaos value, or None when no best
        # match exists.
        confidence: Optional[float]
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method.

    Detect the encoding of the given byte string. It should be mostly
    backward-compatible: the encoding name will match Chardet's own writing
    whenever possible (not for encoding names unsupported by it).

    This function is deprecated and should be used to migrate your project
    easily; consult the documentation for further information. Not planned
    for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings to their
        more modern equivalents? When falsy (the default), encoding names
        found in ``CHARDET_CORRESPONDENCE`` are replaced by the mapped name.
    :param kwargs: Accepted for signature compatibility only; any extra
        argument is ignored and a warning naming it is emitted.
    :returns: A dict with the keys ``encoding``, ``language`` and
        ``confidence``. ``encoding`` and ``confidence`` are ``None`` and
        ``language`` is ``""`` when no best match is available.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        ignored = ",".join(kwargs)
        warn(
            f"charset-normalizer disregard arguments '{ignored}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_match = from_bytes(byte_str).best()

    if best_match is None:
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_match.encoding
        language = "" if best_match.language == "Unknown" else best_match.language
        confidence = 1.0 - best_match.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get
        # stripped in the detection/normalization process, but chardet does
        # return 'utf-8-sig' and it is a valid codec name.
        if encoding == "utf_8" and best_match.bom:
            encoding += "_sig"

    # Truthiness (rather than ``is False``) so that falsy non-bool values
    # such as 0 or None behave like the default False.
    if not should_rename_legacy and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }