U eiR @sddlZddlmZddlmZmZmZmZmZddl m Z m Z m Z m Z ddlmZmZmZmZddlmZddlmZmZdd lmZmZmZmZmZmZmZe d Z!e"Z#e#$e%d dee&e'fe(e(e)eee*eee*e+e+e)e+ed ddZ,dee(e(e)eee*eee*e+e+e)e+ed ddZ-d ee*e&efe(e(e)eee*eee*e+e+e)e+ed ddZ.d!eee*ee&fe(e(e)eee*eee*e+e+e)e+e+d ddZ/dS)"N)PathLike)BinaryIOListOptionalSetUnion)coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE) mess_ratio) CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks iana_nameidentify_sig_or_bom is_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomZcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s皙?TF皙?) sequencessteps chunk_size threshold cp_isolation cp_exclusionpreemptive_behaviourexplainlanguage_thresholdenable_fallbackreturnc / Cs t|ttfs tdt||r>tj} tt t t t |} | dkrt d|rvtt t | prtjtt|dddgdgS|dk rtt d d |d d |D}ng}|dk rtt d d |dd |D}ng}| ||krtt d||| d}| }|dkr:| ||kr:t| |}t |tk} t |tk} | rltt d| n| rtt d| g}|rt|nd}|dk r||tt d|t}g}g}d}d}d}t}t|\}}|dk r||tt dt |||dd|kr.|d|tD]}|rP||krPq6|rd||krdq6||krrq6||d}||k}|ot|}|dkr|stt d|q6|dkr|stt d|q6z t|}Wn.t t!fk rtt d|Yq6YnXzr| r`|dkr`t"|dkrD|dtdn|t |td|dn&t"|dkrp|n|t |d|d}Wn\t#t$fk r}z8t|t$stt d|t"|||WYq6W5d}~XYnXd}|D]} t%|| rd}q q|r&tt d|| q6t&|s2dnt || t| |}!|ob|dk obt || k}"|"rxtt d |tt |!d!}#t'|#d"}#d}$d}%g}&g}'zt(|||!|||||| D]|}(|&|(|'t)|(||dkodt |kod"kn|'d#|kr|$d7}$|$|#ks0|r|dkrq:qWn@t#k r|}z tt d$|t"||#}$d}%W5d}~XYnX|%s| r|sz|td%dj*|d&d'WnLt#k r}z,tt d(|t"|||WYq6W5d}~XYnX|'rt+|'t |'nd})|)|ks*|$|#kr||tt d)||$t,|)d*d+d,| r6|dd|fkr6|%s6t|||dg|}*||kr|*}n|dkr|*}n|*}q6tt d-|t,|)d*d+d,|st-|}+nt.|}+|+rtt d.|t"|+g},|dkr:|&D],}(t/|(||+r&d/|+nd}-|,|-q t0|,}.|.r\tt d0|.||t|||)||.|||ddfkr|)d1krt d2||rtt t | t||gS||kr6t d3||rtt t | t||gSq6t |dkr|s,|s,|r8tt d4|rXt d5|j1||nd|rh|dks|r|r|j2|j2ks|dk rt d6||n|rt d7|||rt d8|3j1t |dn t d9| rtt t | |S):af Given a raw bytes sequence, return the best possibles charset usable to render str objects. If there is no results, it is a strong indicator that the source is binary/not text. By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence. And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will. The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page but never take it for granted. Can improve the performance. You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that purpose. This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32. By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain' toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging. Custom logging format and handler can be set manually. z4Expected object of type bytes or bytearray, got: {0}rz[szfrom_bytes..zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.cSsg|]}t|dqSr,r-r.r1r1r2r3fsz^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.rz>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>utf_32utf_16z\Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.>utf_7zREncoding %s won't be tested as-is because detection is unreliable without BOM/SIG.z2Encoding %s does not provide an IncrementalDecodergA)encodingz9Code page %s does not fit given bytes sequence at ALL. %sTzW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.zaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sgj@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d)ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {},z We detected language {} using {}rz.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)4 isinstance bytearraybytes TypeErrorformattypeloggerlevel addHandlerexplain_handlersetLevelrlendebug removeHandlerloggingWARNINGrrlogjoinintrrrappendsetrr addrrModuleNotFoundError ImportErrorstrUnicodeDecodeError LookupErrorrrangemaxrrdecodesumroundr r r r r8 fingerprintbest)/rr r!r"r#r$r%r&r'r(Zprevious_logger_levellengthZis_too_small_sequenceZis_too_large_sequenceZprioritized_encodingsZspecified_encodingZtestedZtested_but_hard_failureZtested_but_soft_failureZfallback_asciiZ fallback_u8Zfallback_specifiedresultsZ sig_encodingZ sig_payloadZ encoding_ianaZdecoded_payloadZbom_or_sig_availableZstrip_sig_or_bomZis_multi_byte_decodereZsimilar_soft_failure_testZencoding_soft_failedZr_Zmulti_byte_bonusZmax_chunk_gave_upZearly_stop_countZlazy_str_hard_failureZ md_chunksZ md_ratioschunkZmean_mess_ratioZfallback_entryZtarget_languagesZ cd_ratiosZchunk_languagesZcd_ratios_mergedr1r1r2 from_bytes!s                            &                         rh) fpr r!r"r#r$r%r&r'r(r)c Cst|||||||||| S)z Same thing than the function from_bytes but using a file pointer that is already ready. Will not close the file pointer. )rhread) rir r!r"r#r$r%r&r'r(r1r1r2from_fpsrk) pathr r!r"r#r$r%r&r'r(r)c Cs<t|d(} t| ||||||||| W5QRSQRXdS)z Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode. Can raise IOError. rbN)openrk) rlr r!r"r#r$r%r&r'r(rir1r1r2 from_paths ro) fp_or_path_or_payloadr r!r"r#r$r%r&r'r(r)c Cszt|ttfr,t|||||||||| d } nHt|ttfrXt|||||||||| d } nt|||||||||| d } | S)a) Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string. Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match are disabled to be stricter around ASCII-compatible but unlikely to be a string. ) r r!r"r#r$r%r&r'r()rBrZrrorDrCrhrk) rpr r!r"r#r$r%r&r'r(Zguessesr1r1r2 is_binary3sX   rq) rrrNNTFrT) rrrNNTFrT) rrrNNTFrT) rrrNNTFrF)0rPosrtypingrrrrrcdr r r r Zconstantr rrrmdrmodelsrrutilsrrrrrrr getLoggerrH StreamHandlerrK setFormatter FormatterrDrCrTfloatrZboolrhrkrorqr1r1r1r2s  $     Z       !