U eL@snddlmZddlmZddlmZmZddlmZm Z m Z ddl m Z m Z mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZGdddZGd d d eZGd d d eZGd ddeZ GdddeZ!GdddeZ"GdddeZ#GdddeZ$GdddeZ%GdddeZ&eddee'ee'e(dddZ)ed dd'e'e*e(e*d#d$d%Z+d&S)() lru_cache) getLogger)ListOptional)COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated is_arabicis_arabic_isolated_formis_case_variableis_cjk is_emoticon is_hangul is_hiragana is_katakanais_latinis_punctuation is_separator is_symbolis_thaiis_unprintable remove_accent unicode_rangec@sPeZdZdZeedddZeddddZddd d Ze e dd d Z dS) MessDetectorPluginzy Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods.  characterreturncCstdS)z@ Determine if given character should be fed in. NNotImplementedErrorselfrr#E/opt/hc_python/lib64/python3.8/site-packages/charset_normalizer/md.pyeligible%szMessDetectorPlugin.eligibleNcCstdS)z The main routine to be executed upon character. Insert the logic in witch the text would be considered chaotic. Nrr!r#r#r$feed+szMessDetectorPlugin.feedrcCstdS)zB Permit to reset the plugin to the initial state. Nrr"r#r#r$reset2szMessDetectorPlugin.resetcCstdS)z Compute the chaos ratio based on what your feed() has seen. Must NOT be lower than 0.; No restriction gt 0. Nrr(r#r#r$ratio8szMessDetectorPlugin.ratio) __name__ __module__ __qualname____doc__strboolr%r&r)propertyfloatr*r#r#r#r$rs rc@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS) TooManySymbolOrPunctuationPluginNr'cCs"d|_d|_d|_d|_d|_dS)NrF)_punctuation_count _symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr(r#r#r$__init__Bs z)TooManySymbolOrPunctuationPlugin.__init__rcCs|SN isprintabler!r#r#r$r%Jsz)TooManySymbolOrPunctuationPlugin.eligiblecCsp|jd7_||jkrf|tkrft|r8|jd7_n.|dkrft|rft|dkrf|jd7_||_dS)NrF) r6r7rrr4isdigitrrr5r!r#r#r$r&Ms  z%TooManySymbolOrPunctuationPlugin.feedcCsd|_d|_d|_dSNr)r4r6r5r(r#r#r$r)_sz&TooManySymbolOrPunctuationPlugin.resetcCs0|jdkrdS|j|j|j}|dkr,|SdS)Nr333333?)r6r4r5)r"Zratio_of_punctuationr#r#r$r*ds   z&TooManySymbolOrPunctuationPlugin.ratio r+r,r-r8r/r0r%r&r)r1r2r*r#r#r#r$r3As r3c@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)TooManyAccentuatedPluginNr'cCsd|_d|_dSr>r6_accentuated_countr(r#r#r$r8qsz!TooManyAccentuatedPlugin.__init__rcCs|Sr9)isalphar!r#r#r$r%usz!TooManyAccentuatedPlugin.eligiblecCs(|jd7_t|r$|jd7_dSNr)r6r rDr!r#r#r$r&xszTooManyAccentuatedPlugin.feedcCsd|_d|_dSr>rCr(r#r#r$r)~szTooManyAccentuatedPlugin.resetcCs*|jdkrdS|j|j}|dkr&|SdS)Nr?gffffff?rC)r"Zratio_of_accentuationr#r#r$r*s  zTooManyAccentuatedPlugin.ratiorAr#r#r#r$rBps rBc@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)UnprintablePluginNr'cCsd|_d|_dSr>)_unprintable_countr6r(r#r#r$r8szUnprintablePlugin.__init__rcCsdSNTr#r!r#r#r$r%szUnprintablePlugin.eligiblecCs(t|r|jd7_|jd7_dSrF)rrIr6r!r#r#r$r&szUnprintablePlugin.feedcCs d|_dSr>)rIr(r#r#r$r)szUnprintablePlugin.resetcCs|jdkrdS|jd|jS)Nrr?rG)r6rIr(r#r#r$r*s zUnprintablePlugin.ratiorAr#r#r#r$rHs rHc@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)SuspiciousDuplicateAccentPluginNr'cCsd|_d|_d|_dSr>_successive_countr6_last_latin_characterr(r#r#r$r8sz(SuspiciousDuplicateAccentPlugin.__init__rcCs|ot|Sr9)rErr!r#r#r$r%sz(SuspiciousDuplicateAccentPlugin.eligiblecCst|jd7_|jdk rjt|rjt|jrj|rJ|jrJ|jd7_t|t|jkrj|jd7_||_dSrF)r6rNr isupperrMrr!r#r#r$r&sz$SuspiciousDuplicateAccentPlugin.feedcCsd|_d|_d|_dSr>rLr(r#r#r$r)sz%SuspiciousDuplicateAccentPlugin.resetcCs|jdkrdS|jd|jS)Nrr?r<)r6rMr(r#r#r$r*s z%SuspiciousDuplicateAccentPlugin.ratiorAr#r#r#r$rKs rKc@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)SuspiciousRangeNr'cCsd|_d|_d|_dSr>)"_suspicious_successive_range_countr6_last_printable_seenr(r#r#r$r8szSuspiciousRange.__init__rcCs|Sr9r:r!r#r#r$r%szSuspiciousRange.eligiblecCsx|jd7_|s&t|s&|tkr0d|_dS|jdkrD||_dSt|j}t|}t||rn|jd7_||_dSrF)r6isspacerrrRr is_suspiciously_successive_rangerQ)r"runicode_range_aunicode_range_br#r#r$r&s"   zSuspiciousRange.feedcCsd|_d|_d|_dSr>)r6rQrRr(r#r#r$r)szSuspiciousRange.resetcCs"|jdkrdS|jd|j}|S)Nr?r<)r6rQ)r"Zratio_of_suspicious_range_usager#r#r$r*s  zSuspiciousRange.ratiorAr#r#r#r$rPs rPc@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)SuperWeirdWordPluginNr'cCs:d|_d|_d|_d|_d|_d|_d|_d|_d|_dS)NrF) _word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr6_bad_character_count_buffer_buffer_accent_countr(r#r#r$r8szSuperWeirdWordPlugin.__init__rcCsdSrJr#r!r#r#r$r%szSuperWeirdWordPlugin.eligiblecCs6|r|j|7_t|r,|jd7_|jdkrt|dksJt|rt|dkrt|dkrt|dkrt |dkrt |dkrd|_dS|jsdS| st |st |r|jr|jd7_t|j}|j|7_|dkrP|j|dkrd|_t|jdrP|jdrPtdd|jDdkrP|jd7_d|_|d kr|jrd d t|jtd |D}d}|rt||d krd}|s|jd7_d|_|jr|jd7_|jt|j7_d|_d|_d|_d |_n6|dkr2|dkr2t|r2d|_|j|7_dS)NrFTg(\?css|]}|VqdSr9rO).0_r#r#r$ -sz,SuperWeirdWordPlugin.feed..rWcSsg|]\}}|r|qSr#rd)recir#r#r$ 2sz-SuperWeirdWordPlugin.feed..rr@rY><~-=|>rf)rEr`r rar^rrrrrrrSrrrZlenr6r]rOallr\zipranger[r_r=r)r"rZ buffer_lengthZcamel_case_dstZprobable_camel_casedr#r#r$r& s            zSuperWeirdWordPlugin.feedcCs4d|_d|_d|_d|_d|_d|_d|_d|_dS)NrYFr)r`r]r^r[rZr6r_r\r(r#r#r$r)PszSuperWeirdWordPlugin.resetcCs$|jdkr|jdkrdS|j|jS)N rr?)rZr\r_r6r(r#r#r$r*ZszSuperWeirdWordPlugin.ratiorAr#r#r#r$rXs E rXc@s^eZdZdZddddZeedddZeddd d Zddd d Z e e dd dZ dS)CjkInvalidStopPluginu GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected. Searching for the overuse of '丅' and '丄'. Nr'cCsd|_d|_dSr>_wrong_stop_count_cjk_character_countr(r#r#r$r8hszCjkInvalidStopPlugin.__init__rcCsdSrJr#r!r#r#r$r%lszCjkInvalidStopPlugin.eligiblecCs4|dkr|jd7_dSt|r0|jd7_dS)N>丄丅r)rxrryr!r#r#r$r&os zCjkInvalidStopPlugin.feedcCsd|_d|_dSr>rwr(r#r#r$r)vszCjkInvalidStopPlugin.resetcCs|jdkrdS|j|jS)Nr?)ryrxr(r#r#r$r*zs zCjkInvalidStopPlugin.ratio) r+r,r-r.r8r/r0r%r&r)r1r2r*r#r#r#r$rvbsrvc@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)ArchaicUpperLowerPluginNr'cCs.d|_d|_d|_d|_d|_d|_d|_dS)NFrT)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr6_last_alpha_seen_current_ascii_onlyr(r#r#r$r8sz ArchaicUpperLowerPlugin.__init__rcCsdSrJr#r!r#r#r$r%sz ArchaicUpperLowerPlugin.eligiblecCs$|ot|}|dk}|r|jdkr|jdkrV|dkrV|jdkrV|j|j7_d|_d|_d|_d|_|j d7_ d|_dS|jdkr| dkrd|_|jdk r| r|j s| r|j r|jdkr|jd7_d|_qd|_nd|_|j d7_ |jd7_||_dS)NFr@rTr<) rEr rr=rrrrr~r6isasciirOislower)r"rZ is_concernedZ chunk_sepr#r#r$r&sF   zArchaicUpperLowerPlugin.feedcCs.d|_d|_d|_d|_d|_d|_d|_dS)NrFT)r6rrrrr~rr(r#r#r$r)szArchaicUpperLowerPlugin.resetcCs|jdkrdS|j|jS)Nrr?)r6rr(r#r#r$r*s zArchaicUpperLowerPlugin.ratiorAr#r#r#r$r}s  * r}c@sZeZdZddddZddddZeeddd Zeddd d Ze e dd d Z dS)ArabicIsolatedFormPluginNr'cCsd|_d|_dSr>r6_isolated_form_countr(r#r#r$r8sz!ArabicIsolatedFormPlugin.__init__cCsd|_d|_dSr>rr(r#r#r$r)szArabicIsolatedFormPlugin.resetrcCst|Sr9)r r!r#r#r$r%sz!ArabicIsolatedFormPlugin.eligiblecCs(|jd7_t|r$|jd7_dSrF)r6r rr!r#r#r$r&szArabicIsolatedFormPlugin.feedcCs|jdkrdS|j|j}|S)NrGr?r)r"Zisolated_form_usager#r#r$r*s  zArabicIsolatedFormPlugin.ratio) r+r,r-r8r)r/r0r%r&r1r2r*r#r#r#r$rs r)maxsize)rUrVrcCs|dks|dkrdS||kr dSd|kr4d|kr4dSd|ksDd|krHdSd|ksXd|krld|kshd|krldS|d|d}}|D]}|tkrq||krdSq|dk|dk}}|s|rd |ksd |krdS|r|rdSd |ksd |kr d |ksd |krdS|d ks|d kr dSd |ksHd |ksH|d kr|d krd |ks\d |kr`dSd|kstd|krxdS|d ks|d krdSdS)za Determine if two Unicode range seen next to each other can be considered as suspicious. NTFZLatinZ EmoticonsZ Combining )HiraganaKatakanaCJKHangulz Basic Latin)rr PunctuationZForms)splitr )rUrVZkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr#r#r$rTslrTi皙?F)decoded_sequencemaximum_thresholddebugrc CsXddtD}t|d}d}|dkr0d}n|dkr>d}nd }t|d t|D]d\}}|D]} | |r`| |q`|d kr||d ks||dkrTtd d |D}||krTqqT|rNtd} | t d|d|d|t|dkr(| t d|dd| t d|dd|D] } | t | j d| j q,t |dS)zw Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. cSsg|] }|qSr#r#)reZmd_classr#r#r$rj:szmess_ratio..rr?i rr rcss|] }|jVqdSr9)r*)redtr#r#r$rgQszmess_ratio..Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r|zStarting with: Nz Ending with: iz: )r__subclasses__rqrsrtr%r&sumrlogr __class__r*round) rrrZ detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcrindexdetectorloggerrr#r#r$ mess_ratio2sF     rN)rF), functoolsrloggingrtypingrrZconstantrrr utilsr r r r rrrrrrrrrrrrrrr3rBrHrKrPrXrvr}rr/r0rTr2rr#r#r#r$s8  L"/%1iLH