jfOOddlZddlmZmZddlmZmZmZmZ ddl m Z n #e $re Z YnwxYwddl mZmZmZmZddlmZmZmZmZddlmZdd lmZmZdd lmZmZmZm Z m!Z!m"Z"ej#d Z$ej%Z&e&'ej(d  d!de)de*de*de+dee dee de,de,defdZ- d!dede*de*de+dee dee de,de,defdZ. d!de de*de*de+dee dee de,de,defdZ/ d"de de*de*de+dee dee de,defd Z0dS)#N)basenamesplitext)BinaryIOListOptionalSet)PathLike)coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE) mess_ratio) CharsetMatchCharsetMatches)any_specified_encoding iana_nameidentify_sig_or_bom is_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s皙?TF sequencessteps chunk_size threshold cp_isolation cp_exclusionpreemptive_behaviourexplainreturnc ht|ttfs/tdt ||rJt j}t tt tt|} | dkrt d|rEt tt |p tjt#t%|dddgdgS|At td d |d |D}ng}|At td d |d |D}ng}| ||zkr't td||| d}| }|dkr| |z |krt+| |z }t|t,k} t|t.k} | r4t td| n5| r3t td| g} |rt1|nd} | 6| | t td| t5}g}g}d}d}d}t#}t7|\}}|D| |t tdt||| dd| vr| d| t8zD]}|r||vr |r||vr||vr||d}||k}|ot=|}|dvr$|s"t td|l t?|}n8#t@tBf$r$t td|YwxYw | rS|durOtE|dur|dt+dn#|t|t+d|n,tE|dur|n|t|d|}nx#tFtHf$rd}t|tHs/t td|tE|||Yd}~d}~wwxYwd}|D]}tK||rd}n|r$t td||tM|sdnt|| t+| |z }|o|duot|| k} | r!t td|t+t|dz }!tO|!d}!d}"d}#g}$g}%|D]m}&|&|z| d zkr||&|&|z}'|r |dur||'z}' |'(||rd!nd"#}(nK#tF$r>}t td$|tE||!}"d}#Yd}~nd}~wwxYw|r|&dkr~||&d%krrtS|d&})|r`|(d|)|vrTtM|&|&dz d'D]?}*||*|&|z}'|r |dur||'z}'|'(|d!#}(|(d|)|vrn@|$|(|%tU|(||%d'|kr|"dz }"|"|!ks|r|durno|#s| r|s |t+d(d(|d"#n\#tF$rO}t td)|tE|||Yd}~d}~wwxYw|%rtW|%t|%z nd}+|+|ks|"|!kr}||t td*||"tY|+d+zd,-|dd| fvr*|#s(t%|||dg|},|| kr|,}n |dkr|,}n|,}Ct td.|tY|+d+zd,-|st[|}-nt]|}-|-rAt td/|tE|-g}.|dkrB|$D]?}(t_|(d0|-rd1|-nd}/|.|/@ta|.}0|0r4t td2|0||t%|||+||0||| ddfvrt|+d0krnt d3||r9t tt |t#||gcS||krnt d4||r9t tt |t#||gcSt|dkr|s|s|r t td5|r6t d6|j1||n{|r||r|r|j2|j2ks|0t d7||n1|r/t d8|||rDt d9|3j1t|dz nt d:|r9t tt ||S);ae Given a raw bytes sequence, return the best possibles charset usable to render str objects. If there is no results, it is a strong indicator that the source is binary/not text. By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence. And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will. The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page but never take it for granted. Can improve the performance. You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that purpose. This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32. By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain' toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging. Custom logging format and handler can be set manually. z4Expected object of type bytes or bytearray, got: {0}rzzfrom_bytes..]"DDD "e,,DDDzacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c.g|]}t|dSr-r.r/s r2r3zfrom_bytes..hr4r5z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>utf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecodergA)encodingz9Code page %s does not fit given bytes sequence at ALL. %sTzW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.ignorestrict)errorszaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sgj@z^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d)ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)4 isinstance bytearraybytes TypeErrorformattypeloggerlevel addHandlerexplain_handlersetLevelrlendebug removeHandlerloggingWARNINGrrlogjoinintrrrappendsetrraddrrModuleNotFoundError ImportErrorstrUnicodeDecodeError LookupErrorrrangemaxdecodeminrsumroundr r r rr: fingerprintbest)1r r!r"r#r$r%r&r'previous_logger_levellengthis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_ascii fallback_u8fallback_specifiedresults sig_encoding sig_payload encoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_countlazy_str_hard_failure md_chunks md_ratiosi cut_sequencechunkchunk_partial_size_chkjmean_mess_ratiofallback_entrytarget_languages cd_ratioschunk_languagescd_ratios_mergeds1 r2 from_bytesr%sd8 i)U!3 4 4  B I IY      & /*** ^^F {{ STTT  F   1 1 1 OO1DW_ E E E|IwUBPRSSTUUU  5 IIl # #    ED|DDD    6 IIl # #    ED|DDD   *u$%%  l         qyyVe^j00%((  NN-?? NN.>>    L S S         W ^ ^     .BKy)))t%$$%7888  N    UUF  NKG 3I > >L+$$\222  W          )))+++$$W---.?`<`<  M==   M\99  F " "  =!!!+}</ 4K 5 5  0 0 09M 0 JJm     $:=$I$I ! !#[1    JJD    H   $ )>%)G)G'500kD k**"3{#3#3c$ii#?@* #&'500I"3{#3#3#5#56* ### #K0   a--  O!FF  $ * *= 9 9 9 HHHH %*!$;   ],@AA ,0)  %  JJi$      ) ?AAs;/?/?       " .t+ .O$$v-    JJ-     B! ,, 1155 %  8 8 A:~ **$QZ%78L# :(8E(A(A*\9  $++!'<J88(,&    w!FF  $5 (,% % "Q9Q<43G3G),Z)<)<&$ "5556oMM"1a!eR00 " "'0Q^1C'D /F4D4M4M+6+EL , 3 3M( 3 S S !8"8!89_LL!EM   U # # #   Zy99 : : :} )) A%  $555$6)9U)B)B & % *  #d))++&--mH-MMMM%    t!FF  (..}=== 09 AC NNS^^ + +c  i ' '+;?P+P+P # * *= 9 9 9 JJ0 o+Q777    '74F!GGG-H".}iO""!$666)7&&"g--%3NN"0K   K  /C' 3 3 3    % D1-@@  4]CC    JJ8??!3'7#8#8     G # #" 2 2"13>N X)9 : : :TX##  11111)<<   JJ299$m     $      0'7C C C#%% LL@-    7$$_555 5666!7=#9":;; ; ; ; L ( ( LL1     7$$_555 5666!7=#9":;; ; ; ; ) 7||q  . ,>  JJa     + LLI"+    NN- . . . .  ++34#4 +~/III' LLU V V V NN; ' ' ' '  + LLU V V V NN> * * *V k LLNN # LL1      TUUU/_----... Ns]P&&2QQBS!!U2AUU*Z [3[  [7,_$$ `=.A`88`=fpc Rt||||||||S)z Same thing than the function from_bytes but using a file pointer that is already ready. Will not close the file pointer. )rread)rr!r"r#r$r%r&r's r2from_fprs6       r5pathc t|d5}t||||||||cdddS#1swxYwYdS)z Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode. Can raise IOError. rbN)openr) rr!r"r#r$r%r&r'rs r2 from_pathrs dD    R                             s 488c tt|||||||}t|}tt|} t |dkr"t d||} | dxxd| jzz cc<tdt| |d | d5} | | dddn #1swxYwY| S)zi Take a (text-based) file path and try to create another file next to it, this time using UTF-8. rz;Unable to normalize "{}", no encoding charset seems to fit.-z{}r+wbN)rrlistrrSIOErrorrLrjr:rr`replacerYwriteoutput) rr!r"r#r$r%r&rwfilenametarget_extensionsresultrs r2 normalizer7sn  G~~HXh//00 7||q I P P     \\^^FaC&/11  CII%%h8I0J0JKKLLd  "  !!!""""""""""""""" Ms9(D--D14D1)rrrNNTF)rrrNNT)1rVos.pathrrtypingrrrrosr r_r`cdr r r rconstantrrrrmdrmodelsrrutilsrrrrrr getLoggerrN StreamHandlerrQ setFormatter FormatterrJrZfloatboolrrrrr5r2rs&&&&&&&&000000000000HHH RQQQQQQQQQQQ00000000  / 0 0'''))GABB""!%ZZZ ZZ Z s) Z s) ZZZZZZZ~""!%   s)  s) 8""!%       s)  s)        :""!%)) ) )) ) s) ) s) ))))))))s !++