
     fL              
       4   d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d          Z G d	 d
e          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z% G d de          Z& ed          dee'         dee'         de(fd             Z) ed!          	 d)d$e'd%e*d&e(de*fd'            Z+d(S )*    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   V    e Zd ZdZdedefdZdeddfdZd	dZe	de
fd            ZdS )
MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                     t           )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     X/var/www/api.educacionweb.es/myenv/lib/python3.11/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible%   
     "!    Nc                     t           )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r!   r#   s     r%   feedzMessDetectorPlugin.feed+   s
    
 "!r(   c                     t           )zB
        Permit to reset the plugin to the initial state.
        r!   r$   s    r%   resetzMessDetectorPlugin.reset2   r'   r(   c                     t           )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r!   r,   s    r%   ratiozMessDetectorPlugin.ratio8   s
     "!r(   r   N)__name__
__module____qualname____doc__strboolr&   r*   r-   propertyfloatr/    r(   r%   r   r      s         
"# "$ " " " ""c "d " " " "" " " " "u " " " X" " "r(   r   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
 TooManySymbolOrPunctuationPluginr   Nc                 L    d| _         d| _        d| _        d | _        d| _        d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr,   s    r%   __init__z)TooManySymbolOrPunctuationPlugin.__init__B   s0    '("#%&37!,1###r(   r   c                 *    |                                 S Nisprintabler#   s     r%   r&   z)TooManySymbolOrPunctuationPlugin.eligibleJ       $$&&&r(   c                 (   | xj         dz  c_         || j        k    ro|t          vrft          |          r| xj        dz  c_        nF|                                du r0t          |          r!t          |          du r| xj        dz  c_        || _        d S )Nr   F   )	r?   r@   r   r   r=   isdigitr   r   r>   r#   s     r%   r*   z%TooManySymbolOrPunctuationPlugin.feedM   s    " 222!===i(( (''1,'''!!##u,,i(( -	**e33""a'""$-!!!r(   c                 0    d| _         d| _        d| _        d S Nr   )r=   r?   r>   r,   s    r%   r-   z&TooManySymbolOrPunctuationPlugin.reset_   s     "# !r(   c                 ^    | j         dk    rdS | j        | j        z   | j         z  }|dk    r|ndS )Nr           333333?)r?   r=   r>   )r$   ratio_of_punctuations     r%   r/   z&TooManySymbolOrPunctuationPlugin.ratiod   sK     A%%3 #d&88!'" (<s'B'B##Kr(   r0   r1   r2   r3   rB   r5   r6   r&   r*   r-   r7   r8   r/   r9   r(   r%   r;   r;   A   s        2 2 2 2'# '$ ' ' ' '.c .d . . . .$   
 Lu L L L XL L Lr(   r;   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
TooManyAccentuatedPluginr   Nc                 "    d| _         d| _        d S rL   r?   _accentuated_countr,   s    r%   rB   z!TooManyAccentuatedPlugin.__init__q   s    %&'(r(   r   c                 *    |                                 S rD   )isalphar#   s     r%   r&   z!TooManyAccentuatedPlugin.eligibleu   s      """r(   c                 h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S Nr   )r?   r   rV   r#   s     r%   r*   zTooManyAccentuatedPlugin.feedx   sJ    ")$$ 	)##q(####	) 	)r(   c                 "    d| _         d| _        d S rL   rU   r,   s    r%   r-   zTooManyAccentuatedPlugin.reset~   s     !"#r(   c                 N    | j         dk     rdS | j        | j         z  }|dk    r|ndS )N   rN   gffffff?rU   )r$   ratio_of_accentuations     r%   r/   zTooManyAccentuatedPlugin.ratio   s<     1$$3'+'>AV'V(=(E(E$$3Nr(   r0   rQ   r9   r(   r%   rS   rS   p   s        ) ) ) )## #$ # # # #)c )d ) ) ) )$ $ $ $ Ou O O O XO O Or(   rS   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
UnprintablePluginr   Nc                 "    d| _         d| _        d S rL   )_unprintable_countr?   r,   s    r%   rB   zUnprintablePlugin.__init__   s    '(%&r(   r   c                     dS NTr9   r#   s     r%   r&   zUnprintablePlugin.eligible       tr(   c                 d    t          |          r| xj        dz  c_        | xj        dz  c_        d S rZ   )r   rb   r?   r#   s     r%   r*   zUnprintablePlugin.feed   s@    )$$ 	)##q(##"r(   c                     d| _         d S rL   )rb   r,   s    r%   r-   zUnprintablePlugin.reset   s    "#r(   c                 @    | j         dk    rdS | j        dz  | j         z  S )Nr   rN   r]   )r?   rb   r,   s    r%   r/   zUnprintablePlugin.ratio   s+     A%%3'!+t/DDDr(   r0   rQ   r9   r(   r%   r`   r`      s        ' ' ' '# $    #c #d # # # #
$ $ $ $ Eu E E E XE E Er(   r`   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuspiciousDuplicateAccentPluginr   Nc                 0    d| _         d| _        d | _        d S rL   _successive_countr?   _last_latin_characterr,   s    r%   rB   z(SuspiciousDuplicateAccentPlugin.__init__   s     &'%&48"""r(   r   c                 H    |                                 ot          |          S rD   )rX   r   r#   s     r%   r&   z(SuspiciousDuplicateAccentPlugin.eligible   s!      "":x	':'::r(   c                 l   | xj         dz  c_         | j        t          |          rt          | j                  rr|                                r)| j                                        r| xj        dz  c_        t          |          t          | j                  k    r| xj        dz  c_        || _        d S rZ   )r?   rn   r   isupperrm   r   r#   s     r%   r*   z$SuspiciousDuplicateAccentPlugin.feed   s    "&2y)) 3t9:: 3   "" ,t'A'I'I'K'K ,&&!+&&Y''=9S+T+TTT&&!+&&%."""r(   c                 0    d| _         d| _        d | _        d S rL   rl   r,   s    r%   r-   z%SuspiciousDuplicateAccentPlugin.reset   s     !" !%)"""r(   c                 @    | j         dk    rdS | j        dz  | j         z  S )Nr   rN   rI   )r?   rm   r,   s    r%   r/   z%SuspiciousDuplicateAccentPlugin.ratio   s+     A%%3&*d.CCCr(   r0   rQ   r9   r(   r%   rj   rj      s        9 9 9 9;# ;$ ; ; ; ;/c /d / / / /* * * *
 Du D D D XD D Dr(   rj   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuspiciousRanger   Nc                 0    d| _         d| _        d | _        d S rL   )"_suspicious_successive_range_countr?   _last_printable_seenr,   s    r%   rB   zSuspiciousRange.__init__   s     78/%&37!!!r(   r   c                 *    |                                 S rD   rE   r#   s     r%   r&   zSuspiciousRange.eligible   rG   r(   c                 D   | xj         dz  c_         |                                st          |          s	|t          v r	d | _        d S | j        	|| _        d S t          | j                  }t          |          }t          ||          r| xj        dz  c_        || _        d S rZ   )r?   isspacer   r   rx   r    is_suspiciously_successive_rangerw   )r$   r   unicode_range_aunicode_range_bs       r%   r*   zSuspiciousRange.feed   s    " 	i((	 888(,D%F$,(1D%F)6t7P)Q)Q)6y)A)A+O_MM 	933q833$-!!!r(   c                 0    d| _         d| _        d | _        d S rL   )r?   rw   rx   r,   s    r%   r-   zSuspiciousRange.reset   s      !23/$(!!!r(   c                 D    | j         dk    rdS | j        dz  | j         z  }|S )N   rN   rI   )r?   rw   )r$   ratio_of_suspicious_range_usages     r%   r/   zSuspiciousRange.ratio   s8     B&&3 3a7!2"' /.r(   r0   rQ   r9   r(   r%   ru   ru      s        8 8 8 8
'# '$ ' ' ' '.c .d . . . ..) ) ) )
 /u / / / X/ / /r(   ru   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
SuperWeirdWordPluginr   Nc                     d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr?   _bad_character_count_buffer_buffer_accent_countr,   s    r%   rB   zSuperWeirdWordPlugin.__init__   sO     !$%() */!). %&)*!)*!!!r(   r   c                     dS rd   r9   r#   s     r%   r&   zSuperWeirdWordPlugin.eligible  re   r(   c                    |                                 r| xj        |z  c_        t          |          r| xj        dz  c_        | j        du r|t          |          du st          |          r\t          |          du rKt          |          du r:t          |          du r)t          |          du rt          |          du rd| _        d S | j        sd S |                                st          |          st          |          r| j        r| xj        dz  c_        t          | j                  }| xj        |z  c_        |dk    r| j        |z  dk    rd| _        t          | j        d                   rV| j        d                                         r7t'          d | j        D                       du r| xj        dz  c_        d| _        |dk    ri| j        rbd	 t+          | j        t-          d
|                    D             }d}|rt          |          |z  dk    rd}|s| xj        dz  c_        d| _        | j        r9| xj        dz  c_        | xj        t          | j                  z  c_        d| _        d| _        d| _        d
| _        d S |dvr>|                                du r*t5          |          rd| _        | xj        |z  c_        d S d S d S d S )Nr   FT   g(\?c              3   >   K   | ]}|                                 V  d S rD   rq   ).0_s     r%   	<genexpr>z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>-  s*      >>AAIIKK>>>>>>r(   r   c                 @    g | ]\  }}|                                 |S r9   r   )r   cis      r%   
<listcomp>z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>2  s:     " " "1yy{{"" " "r(   r   rO   r   >   r   -<=>|~)rX   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   lenr?   r   rq   allr   zipranger   r   rJ   r   )r$   r   buffer_lengthcamel_case_dstprobable_camel_caseds        r%   r*   zSuperWeirdWordPlugin.feed  sf    	LLI%LLi(( /))Q.))(E11i((E11^I5N5N19%%..i((E11	**e33	**e33I&&%//+/(F| 	F1	&#1)#<#<1	&@LY@W@W1	&l1	& !!$T\!2!2M!!]2!!!!,}<tCC04D- #4<#3445R(00225 >>>>>>>%GG,,1,,04D-""t'?"" " #DL%=2I2I J J" " "
 .3$! 0s>':':]'Jc'Q'Q+/(+ 5,,1,,04D-( 2$$)$$))S->->>)),1)',D$DL()D%%%@@@!!##u,,)$$ - )-D%LLI%LLLL A@,,,,r(   c                 v    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d S )Nr   Fr   )r   r   r   r   r   r?   r   r   r,   s    r%   r-   zSuperWeirdWordPlugin.resetP  sG    $)!#(   !$%!#$   r(   c                 P    | j         dk    r| j        dk    rdS | j        | j        z  S )N
   r   rN   )r   r   r   r?   r,   s    r%   r/   zSuperWeirdWordPlugin.ratioZ  s3    r!!d&>!&C&C3(4+@@@r(   r0   rQ   r9   r(   r%   r   r      s        + + + +# $    C&c C&d C& C& C& C&J% % % % Au A A A XA A Ar(   r   c                   ^    e Zd ZdZd
dZdedefdZdeddfdZd
dZ	e
defd	            ZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                 "    d| _         d| _        d S rL   _wrong_stop_count_cjk_character_countr,   s    r%   rB   zCjkInvalidStopPlugin.__init__h  s    &')*!!!r(   r   c                     dS rd   r9   r#   s     r%   r&   zCjkInvalidStopPlugin.eligiblel  re   r(   c                 t    |dv r| xj         dz  c_         d S t          |          r| xj        dz  c_        d S d S )N>      丄   丅r   )r   r   r   r#   s     r%   r*   zCjkInvalidStopPlugin.feedo  sZ    &&""a'""F) 	+%%*%%%%	+ 	+r(   c                 "    d| _         d| _        d S rL   r   r,   s    r%   r-   zCjkInvalidStopPlugin.resetv  s    !"$%!!!r(   c                 :    | j         dk     rdS | j        | j         z  S )N   rN   )r   r   r,   s    r%   r/   zCjkInvalidStopPlugin.ratioz  s&    $r))3%(AAAr(   r0   )r1   r2   r3   r4   rB   r5   r6   r&   r*   r-   r7   r8   r/   r9   r(   r%   r   r   b  s         
+ + + +# $    +c +d + + + +& & & & Bu B B B XB B Br(   r   c                   Z    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd            ZdS )
ArchaicUpperLowerPluginr   Nc                 h    d| _         d| _        d| _        d| _        d| _        d | _        d| _        d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr?   _last_alpha_seen_current_ascii_onlyr,   s    r%   rB   z ArchaicUpperLowerPlugin.__init__  s?    	45,23*890%&/3)-   r(   r   c                     dS rd   r9   r#   s     r%   r&   z ArchaicUpperLowerPlugin.eligible  re   r(   c                    |                                 ot          |          }|du }|r| j        dk    rt| j        dk    r4|                                du r| j        du r| xj        | j        z  c_        d| _        d| _        d | _        d| _        | xj	        dz  c_	        d| _        d S | j        du r|
                                du rd| _        | j        |                                r| j                                        s-|                                rB| j                                        r)| j        du r| xj        dz  c_        d| _        nd| _        nd| _        | xj	        dz  c_	        | xj        dz  c_        || _        d S )NFr   @   r   TrI   )rX   r   r   rJ   r   r   r   r   r   r?   isasciirq   islower)r$   r   is_concerned	chunk_seps       r%   r*   zArchaicUpperLowerPlugin.feed  s    ((**J/?	/J/J E)	 	=AA4::%%''500,5588688 23D.34D0$(D!DI!!Q&!!'+D$F#t++	0A0A0C0Cu0L0L',D$ ,!!## 	"(=(E(E(G(G 	"!!##	"(,(=(E(E(G(G	" 9$$66!;66 %DII $DII!	",,1,, )r(   c                 h    d| _         d| _        d| _        d| _        d | _        d| _        d| _        d S )Nr   FT)r?   r   r   r   r   r   r   r,   s    r%   r-   zArchaicUpperLowerPlugin.reset  s?     !/0,-.*340 $	#'   r(   c                 :    | j         dk    rdS | j        | j         z  S )Nr   rN   )r?   r   r,   s    r%   r/   zArchaicUpperLowerPlugin.ratio  s&     A%%37$:OOOr(   r0   rQ   r9   r(   r%   r   r     s        . . . .# $    (*c (*d (* (* (* (*T( ( ( ( Pu P P P XP P Pr(   r   c                   Z    e Zd Zd	dZd	dZdedefdZdeddfdZe	de
fd            ZdS )
ArabicIsolatedFormPluginr   Nc                 "    d| _         d| _        d S rL   r?   _isolated_form_countr,   s    r%   rB   z!ArabicIsolatedFormPlugin.__init__  s    %&)*!!!r(   c                 "    d| _         d| _        d S rL   r   r,   s    r%   r-   zArabicIsolatedFormPlugin.reset  s     !$%!!!r(   r   c                      t          |          S rD   )r   r#   s     r%   r&   z!ArabicIsolatedFormPlugin.eligible  s    ###r(   c                 h    | xj         dz  c_         t          |          r| xj        dz  c_        d S d S rZ   )r?   r   r   r#   s     r%   r*   zArabicIsolatedFormPlugin.feed  sJ    ""9-- 	+%%*%%%%	+ 	+r(   c                 >    | j         dk     rdS | j        | j         z  }|S )Nr]   rN   r   )r$   isolated_form_usages     r%   r/   zArabicIsolatedFormPlugin.ratio  s,     1$$3%)%>AV%V""r(   r0   )r1   r2   r3   rB   r-   r5   r6   r&   r*   r7   r8   r/   r9   r(   r%   r   r     s        + + + +& & & &$# $$ $ $ $ $+c +d + + + + #u # # # X# # #r(   r      )maxsizer}   r~   r   c                    | |dS | |k    rdS d| v rd|v rdS d| v sd|v rdS d| v sd|v r
d| v sd|v rdS |                      d          |                     d          }}|D ]}|t          v r||v r dS | dv |dv }}|s|r
d	| v sd	|v rdS |r|rdS d
| v sd
|v rd	| v sd	|v rdS | dk    s|dk    rdS d	| v sd	|v s| dv r&|dv r"d| v sd|v rdS d| v sd|v rdS | dk    s|dk    rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr
   )r}   r~   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r%   r|   r|     s    /"9t/))u/!!g&@&@uo%%)G)Gu 	?""g&@&@&&+*H*Hu)8)>)>* *S!! '   000!!!55 "
 	
	

 	33 ' 	 ,   E_$<$<u , u?""h/&A&AO##u'?'?5m++-/O/O5 	  E_$<$<333777O++}/O/O5o%%O)C)C5m++-/O/O54r(   i   皙?Fdecoded_sequencemaximum_thresholddebugc           	      Z   d t                                           D             }t          |           dz   }d}|dk     rd}n|dk    rd}nd}t          | d	z   t	          |                    D ]m\  }}|D ],}	|	                    |          r|	                    |           -|d
k    r	||z  d
k    s	||dz
  k    r!t          d |D                       }||k    r nn|rt          d          }
|
	                    t          d| d| d|            t          |           dk    rL|
	                    t          d| dd                     |
	                    t          d| dd                     |D ],}|
	                    t          |j         d|j                    -t          |d          S )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 "    g | ]} |            S r9   r9   )r   md_classs     r%   r   zmess_ratio.<locals>.<listcomp>:  s+     + + +

+ + +r(   r   rN   i       r   r      
r   c              3   $   K   | ]}|j         V  d S rD   )r/   )r   dts     r%   r   zmess_ratio.<locals>.<genexpr>Q  s$      !?!?r"(!?!?!?!?!?!?r(   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r&   r*   sumr   logr	   	__class__r/   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   s               r%   
mess_ratior  2  s1   + +#5#D#D#F#F+ + +I &''!+F O||13))	4,.)),/) 04 7vGG  	5! 	) 	)H  ++ )i((( AII%"CCqHHfqj  !!?!?Y!?!?!???O"333 =/00

51R5 5et5 5!25 5	
 	
 	
   2%%JJuG0@"0EGGHHHJJuG.>suu.EGGHHH 	= 	=BJJu;;;;<<<<!$$$r(   N)r   F),	functoolsr   loggingr   typingr   r   constantr   r	   r
   utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r;   rS   r`   rj   ru   r   r   r   r   r5   r6   r|   r8   r  r9   r(   r%   <module>r     s               ! ! ! ! ! ! ! !         
                                     *" " " " " " " "D,L ,L ,L ,L ,L'9 ,L ,L ,L^O O O O O1 O O O6E E E E E* E E E0"D "D "D "D "D&8 "D "D "DJ./ ./ ./ ./ ./( ./ ./ ./bfA fA fA fA fA- fA fA fARB B B B B- B B B>IP IP IP IP IP0 IP IP IPX# # # # #1 # # #8 4Ec]E5=c]E	E E E EP 4IN4% 4%4%.34%BF4%
4% 4% 4% 4% 4% 4%r(   