B
    cD                 @   sJ  d dl mZ d dlmZmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZ G dd deZ!eddee" ee" e#dddZ$eddd$e"e%e#e%d d!d"Z&d#S )%    )	lru_cache)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec               @   sP   e Zd ZdZeedddZeddddZddd	d
Ze	e
dddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    )	characterreturnc             C   s   t dS )z@
        Determine if given character should be fed in.
        N)NotImplementedError)selfr    r   E/tmp/pip-install-h9fvvp4j/charset-normalizer/charset_normalizer/md.pyeligible   s    zMessDetectorPlugin.eligibleNc             C   s   t dS )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        N)r   )r   r   r   r   r   feed%   s    zMessDetectorPlugin.feed)r   c             C   s   t dS )zB
        Permit to reset the plugin to the initial state.
        N)r   )r   r   r   r   reset,   s    zMessDetectorPlugin.resetc             C   s   t dS )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        N)r   )r   r   r   r   ratio2   s    zMessDetectorPlugin.ratio)__name__
__module____qualname____doc__strboolr   r    r!   propertyfloatr"   r   r   r   r   r      s   r   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS ) TooManySymbolOrPunctuationPluginN)r   c             C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_word)r   r   r   r   __init__<   s
    z)TooManySymbolOrPunctuationPlugin.__init__)r   r   c             C   s   |  S )N)isprintable)r   r   r   r   r   r   D   s    z)TooManySymbolOrPunctuationPlugin.eligiblec             C   sp   |  j d7  _ || jkrf|tkrft|r8|  jd7  _n.| dkrft|rft|dkrf|  jd7  _|| _d S )Nr   F   )	r.   r/   r   r   r,   isdigitr   r   r-   )r   r   r   r   r   r    G   s    
z%TooManySymbolOrPunctuationPlugin.feedc             C   s   d| _ d| _d| _d S )Nr   )r,   r.   r-   )r   r   r   r   r!   Y   s    z&TooManySymbolOrPunctuationPlugin.resetc             C   s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr   g        g333333?)r.   r,   r-   )r   Zratio_of_punctuationr   r   r   r"   ^   s
    

z&TooManySymbolOrPunctuationPlugin.ratio)r#   r$   r%   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   r+   ;   s   r+   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )TooManyAccentuatedPluginN)r   c             C   s   d| _ d| _d S )Nr   )r.   _accentuated_count)r   r   r   r   r0   k   s    z!TooManyAccentuatedPlugin.__init__)r   r   c             C   s   |  S )N)isalpha)r   r   r   r   r   r   o   s    z!TooManyAccentuatedPlugin.eligiblec             C   s(   |  j d7  _ t|r$|  jd7  _d S )Nr   )r.   r   r5   )r   r   r   r   r   r    r   s    zTooManyAccentuatedPlugin.feedc             C   s   d| _ d| _d S )Nr   )r.   r5   )r   r   r   r   r!   x   s    zTooManyAccentuatedPlugin.resetc             C   s*   | j dkrdS | j| j  }|dkr&|S dS )Nr   g        gffffff?)r.   r5   )r   Zratio_of_accentuationr   r   r   r"   |   s    
zTooManyAccentuatedPlugin.ratio)r#   r$   r%   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   r4   j   s   r4   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )UnprintablePluginN)r   c             C   s   d| _ d| _d S )Nr   )_unprintable_countr.   )r   r   r   r   r0      s    zUnprintablePlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r      s    zUnprintablePlugin.eligiblec             C   s(   t |r|  jd7  _|  jd7  _d S )Nr   )r   r8   r.   )r   r   r   r   r   r       s    zUnprintablePlugin.feedc             C   s
   d| _ d S )Nr   )r8   )r   r   r   r   r!      s    zUnprintablePlugin.resetc             C   s   | j dkrdS | jd | j  S )Nr   g           )r.   r8   )r   r   r   r   r"      s    
zUnprintablePlugin.ratio)r#   r$   r%   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   r7      s   r7   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousDuplicateAccentPluginN)r   c             C   s   d| _ d| _d | _d S )Nr   )_successive_countr.   _last_latin_character)r   r   r   r   r0      s    z(SuspiciousDuplicateAccentPlugin.__init__)r   r   c             C   s   |  ot|S )N)r6   r   )r   r   r   r   r   r      s    z(SuspiciousDuplicateAccentPlugin.eligiblec             C   st   |  j d7  _ | jd k	rjt|rjt| jrj| rJ| j rJ|  jd7  _t|t| jkrj|  jd7  _|| _d S )Nr   )r.   r<   r   isupperr;   r   )r   r   r   r   r   r       s    

z$SuspiciousDuplicateAccentPlugin.feedc             C   s   d| _ d| _d | _d S )Nr   )r;   r.   r<   )r   r   r   r   r!      s    z%SuspiciousDuplicateAccentPlugin.resetc             C   s   | j dkrdS | jd | j  S )Nr   g        r2   )r.   r;   )r   r   r   r   r"      s    
z%SuspiciousDuplicateAccentPlugin.ratio)r#   r$   r%   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   r:      s   r:   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuspiciousRangeN)r   c             C   s   d| _ d| _d | _d S )Nr   )"_suspicious_successive_range_countr.   _last_printable_seen)r   r   r   r   r0      s    zSuspiciousRange.__init__)r   r   c             C   s   |  S )N)r1   )r   r   r   r   r   r      s    zSuspiciousRange.eligiblec             C   sx   |  j d7  _ | s&t|s&|tkr0d | _d S | jd krD|| _d S t| j}t|}t||rn|  jd7  _|| _d S )Nr   )r.   isspacer   r   r@   r    is_suspiciously_successive_ranger?   )r   r   unicode_range_aunicode_range_br   r   r   r       s    


zSuspiciousRange.feedc             C   s   d| _ d| _d | _d S )Nr   )r.   r?   r@   )r   r   r   r   r!      s    zSuspiciousRange.resetc             C   s.   | j dkrdS | jd | j  }|dk r*dS |S )Nr   g        r2   g?)r.   r?   )r   Zratio_of_suspicious_range_usager   r   r   r"      s    
zSuspiciousRange.ratio)r#   r$   r%   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   r>      s   r>   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )SuperWeirdWordPluginN)r   c             C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr.   _bad_character_count_buffer_buffer_accent_count)r   r   r   r   r0      s    zSuperWeirdWordPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r     s    zSuperWeirdWordPlugin.eligiblec             C   s  |  r|  j|7  _t|r,|  jd7  _| jdkrt|dksJt|rt|dkrt|dkrt|dkrt	|dkrt
|dkrd| _d S | jsd S | st|st|r| jr|  jd7  _t| j}|  j|7  _|dkr6| j| dkrd| _t| jd r6| jd  r6|  jd7  _d| _|dkr\| jr\|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d	| _n6|d
kr| dkrt|rd| _|  j|7  _d S )Nr   FT   g(\?   rF   r   >   =|-><_~)r6   rM   r   rN   rK   r   r   r   r   r   r   rA   r   r   rG   lenr.   rJ   r=   rI   rH   rL   r3   r   )r   r   Zbuffer_lengthr   r   r   r      sR    


 

zSuperWeirdWordPlugin.feedc             C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )NrF   Fr   )rM   rJ   rK   rH   rG   r.   rL   rI   )r   r   r   r   r!   =  s    zSuperWeirdWordPlugin.resetc             C   s$   | j dkr| jdkrdS | j| j S )N
   r   g        )rG   rI   rL   r.   )r   r   r   r   r"   G  s    zSuperWeirdWordPlugin.ratio)r#   r$   r%   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   rE      s   6
rE   c               @   s^   e Zd ZdZddddZeedddZeddd	d
ZddddZ	e
edddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    N)r   c             C   s   d| _ d| _d S )Nr   )_wrong_stop_count_cjk_character_count)r   r   r   r   r0   U  s    zCjkInvalidStopPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r   Y  s    zCjkInvalidStopPlugin.eligiblec             C   s4   |dkr|  j d7  _ d S t|r0|  jd7  _d S )N>      丅   丄r   )r\   r   r]   )r   r   r   r   r   r    \  s
    zCjkInvalidStopPlugin.feedc             C   s   d| _ d| _d S )Nr   )r\   r]   )r   r   r   r   r!   c  s    zCjkInvalidStopPlugin.resetc             C   s   | j dk rdS | j| j  S )N   g        )r]   r\   )r   r   r   r   r"   g  s    
zCjkInvalidStopPlugin.ratio)r#   r$   r%   r&   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   r[   O  s   r[   c               @   sZ   e Zd ZddddZeedddZedddd	Zddd
dZe	e
dddZdS )ArchaicUpperLowerPluginN)r   c             C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr.   _last_alpha_seen_current_ascii_only)r   r   r   r   r0   o  s    z ArchaicUpperLowerPlugin.__init__)r   r   c             C   s   dS )NTr   )r   r   r   r   r   r   |  s    z ArchaicUpperLowerPlugin.eligiblec             C   s$  |  ot|}|dk}|r| jdkr| jdkrV| dkrV| jdkrV|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdkrt
|dkrd| _| jd k	r| r| j s| r| j r| jdkr|  jd7  _d| _qd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr2   )r6   r
   rc   r3   rg   re   rd   rf   rb   r.   r	   r=   islower)r   r   Zis_concernedZ	chunk_sepr   r   r   r      s8    




zArchaicUpperLowerPlugin.feedc             C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r.   rc   rd   re   rf   rb   rg   )r   r   r   r   r!     s    zArchaicUpperLowerPlugin.resetc             C   s   | j dkrdS | j| j  S )Nr   g        )r.   re   )r   r   r   r   r"     s    
zArchaicUpperLowerPlugin.ratio)r#   r$   r%   r0   r'   r(   r   r    r!   r)   r*   r"   r   r   r   r   ra   n  s   *	ra   i   )maxsize)rC   rD   r   c             C   s~  | dks|dkrdS | |kr dS d| kr4d|kr4dS d| ksDd|krHdS d| ksXd|krld| kshd|krldS |  d| d }}x"|D ]}|tkrq||krdS qW | dk|dk }}|s|rd	| ksd	|krdS |r|rdS d
| ksd
|kr"d	| ksd	|kr
dS | dks|dkr"dS d	| ksJd	|ksJ| dkrz|dkrzd| ks^d|krbdS d| ksvd|krzdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKZHangulzBasic Latin)rm   rl   ZPunctuationZForms)splitr   )rC   rD   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr   r   r   rB     sP    


rB   i   皙?F)decoded_sequencemaximum_thresholddebugr   c             C   s   dd t  D }t| d }d}|dk r0d}n|dkr>d}nd	}x|t| d
 t|D ]f\}}x |D ]}	|	|rd|	| qdW |dkr|| dks||d krVtdd |D }||krVP qVW |rx|D ]}
t|
j	|
j
 qW t|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c             S   s   g | ]
}| qS r   r   ).0Zmd_classr   r   r   
<listcomp>
  s    zmess_ratio.<locals>.<listcomp>r   g        i       i   rh      
r   c             s   s   | ]}|j V  qd S )N)r"   )ru   dtr   r   r   	<genexpr>   s    zmess_ratio.<locals>.<genexpr>   )r   __subclasses__rY   zipranger   r    sumprint	__class__r"   round)rr   rs   rt   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorrz   r   r   r   
mess_ratio  s*    	


r   N)rq   F)'	functoolsr   typingr   r   Zconstantr   r   utilsr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r4   r7   r:   r>   rE   r[   ra   r'   r(   rB   r*   r   r   r   r   r   <module>   s$   H"/%4ZLD