U
    3g0                     @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZmZmZ G d	d
 d
ZG dd dZeeef Zee ZG dd dZdS )    )aliases)sha256)dumps)sub)AnyDictIteratorListOptionalTupleUnion   )RE_POSSIBLE_ENCODING_INDICATIONTOO_BIG_SEQUENCE)	iana_nameis_multi_byte_encodingunicode_rangec                	   @   s  e Zd Zd8eeeedee ee dddZe	edddZ
e	edd	d
ZeedddZedddZedddZd ddddZeedddZeee dddZeedddZeedddZeee dddZeedddZeedd d!Zeedd"d#Zeedd$d%Zeedd&d'Zeedd(d)Zeed  dd*d+Zeedd,d-Zeee dd.d/Zeee dd0d1Z d9eed3d4d5Z!eedd6d7Z"dS ):CharsetMatchNCoherenceMatches)payloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesdecoded_payloadpreemptive_declarationc                 C   sL   || _ || _|| _|| _|| _d | _g | _d| _d | _d | _	|| _
|| _d S )N        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leavesZ_mean_coherence_ratio_output_payload_output_encoding_string_preemptive_declaration)selfr   r   r   r   r   r   r    r)   =/tmp/pip-unpacked-wheel-t7ubxsek/charset_normalizer/models.py__init__   s    
zCharsetMatch.__init__)otherreturnc                 C   s>   t |ts&t |tr"t|| jkS dS | j|jko<| j|jkS )NF)
isinstancer   strr   encodingfingerprintr(   r,   r)   r)   r*   __eq__(   s
    

zCharsetMatch.__eq__c                 C   s   t |tstt| j|j }t| j|j }|dk rJ|dkrJ| j|jkS |dk r|dkrt| jtkrt| j|jk S | j	|j	kS | j|jk S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gz?g{Gz?)
r.   r   
ValueErrorabschaos	coherencelenr   r   multi_byte_usage)r(   r,   Zchaos_differenceZcoherence_differencer)   r)   r*   __lt__/   s    
zCharsetMatch.__lt__r-   c                 C   s   dt t| t | j  S )Ng      ?)r8   r/   rawr(   r)   r)   r*   r9   E   s    zCharsetMatch.multi_byte_usagec                 C   s"   | j d krt| j| jd| _ | j S )Nstrict)r&   r/   r   r   r=   r)   r)   r*   __str__I   s    
zCharsetMatch.__str__c                 C   s   d | j| jS )Nz<CharsetMatch '{}' bytes({})>)formatr0   r1   r=   r)   r)   r*   __repr__O   s    zCharsetMatch.__repr__c                 C   s8   t |tr|| kr"td|jd |_| j| d S )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r.   r   r4   r@   	__class__r&   r#   appendr2   r)   r)   r*   add_submatchR   s    zCharsetMatch.add_submatchc                 C   s   | j S N)r   r=   r)   r)   r*   r0   ]   s    zCharsetMatch.encodingc                 C   sD   g }t  D ]2\}}| j|kr*|| q| j|kr|| q|S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr0   rC   )r(   Zalso_known_asupr)   r)   r*   encoding_aliasesa   s    

zCharsetMatch.encoding_aliasesc                 C   s   | j S rE   r!   r=   r)   r)   r*   bomn   s    zCharsetMatch.bomc                 C   s   | j S rE   rJ   r=   r)   r)   r*   byte_order_markr   s    zCharsetMatch.byte_order_markc                 C   s   dd | j D S )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        c                 S   s   g | ]}|d  qS )r   r)   ).0er)   r)   r*   
<listcomp>|   s     z*CharsetMatch.languages.<locals>.<listcomp>r    r=   r)   r)   r*   r   v   s    zCharsetMatch.languagesc                 C   sp   | j sbd| jkrdS ddlm}m} t| jr8|| jn|| j}t|dksVd|krZdS |d S | j d d S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiZEnglishr   )encoding_languagesmb_encoding_languageszLatin BasedUnknown)r    could_be_from_charsetZcharset_normalizer.cdrR   rS   r   r0   r8   )r(   rR   rS   r   r)   r)   r*   language~   s    
zCharsetMatch.languagec                 C   s   | j S rE   )r   r=   r)   r)   r*   r6      s    zCharsetMatch.chaosc                 C   s   | j s
dS | j d d S )Nr   r   r   rP   r=   r)   r)   r*   r7      s    zCharsetMatch.coherencec                 C   s   t | jd ddS Nd      )ndigits)roundr6   r=   r)   r)   r*   percent_chaos   s    zCharsetMatch.percent_chaosc                 C   s   t | jd ddS rW   )r[   r7   r=   r)   r)   r*   percent_coherence   s    zCharsetMatch.percent_coherencec                 C   s   | j S )z+
        Original untouched bytes.
        )r   r=   r)   r)   r*   r<      s    zCharsetMatch.rawc                 C   s   | j S rE   )r#   r=   r)   r)   r*   submatch   s    zCharsetMatch.submatchc                 C   s   t | jdkS Nr   )r8   r#   r=   r)   r)   r*   has_submatch   s    zCharsetMatch.has_submatchc                 C   s@   | j d k	r| j S dd t| D }ttdd |D | _ | j S )Nc                 S   s   g | ]}t |qS r)   )r   )rM   charr)   r)   r*   rO      s    z*CharsetMatch.alphabets.<locals>.<listcomp>c                 S   s   h | ]}|r|qS r)   r)   )rM   rr)   r)   r*   	<setcomp>   s      z)CharsetMatch.alphabets.<locals>.<setcomp>)r"   r/   sortedlist)r(   Zdetected_rangesr)   r)   r*   	alphabets   s    
zCharsetMatch.alphabetsc                 C   s   | j gdd | jD  S )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        c                 S   s   g | ]
}|j qS r)   )r0   )rM   mr)   r)   r*   rO      s     z6CharsetMatch.could_be_from_charset.<locals>.<listcomp>)r   r#   r=   r)   r)   r*   rU      s    z"CharsetMatch.could_be_from_charsetutf_8)r0   r-   c                    s|    j dks j |krv| _ t } jdk	rh j dkrhtt fdd|dd d}||dd  }||d _ jS )z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        N)zutf-8utf8rh   c                    s4   | j |  d |  d  |  d t jS )Nr   r   )stringspanreplacegroupsr   r%   )rg   r=   r)   r*   <lambda>   s   
 z%CharsetMatch.output.<locals>.<lambda>i    r   rl   )r%   r/   r'   lowerr   r   encoder$   )r(   r0   decoded_stringZpatched_headerr)   r=   r*   output   s$    

	zCharsetMatch.outputc                 C   s   t |   S )zw
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        )r   rr   	hexdigestr=   r)   r)   r*   r1      s    zCharsetMatch.fingerprint)NN)rh   )#__name__
__module____qualname__bytesr/   floatboolr
   r+   objectr3   r:   propertyr9   r?   rA   rD   r0   r	   rI   rK   rL   r   rV   r6   r7   r\   r]   r<   r^   r`   rf   rU   rr   r1   r)   r)   r)   r*   r      sf     r   c                   @   s   e Zd ZdZdeee  dddZee dddZ	e
eef ed	d
dZedddZedddZedd	ddZed dddZed dddZdS )CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    N)resultsc                 C   s   |rt |ng | _d S rE   )rd   _results)r(   r}   r)   r)   r*   r+      s    zCharsetMatches.__init__r;   c                 c   s   | j E d H  d S rE   r~   r=   r)   r)   r*   __iter__   s    zCharsetMatches.__iter__)itemr-   c                 C   sN   t |tr| j| S t |trFt|d}| jD ]}||jkr.|  S q.tdS )z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        FN)r.   intr~   r/   r   rU   KeyError)r(   r   resultr)   r)   r*   __getitem__   s    






zCharsetMatches.__getitem__c                 C   s
   t | jS rE   r8   r~   r=   r)   r)   r*   __len__  s    zCharsetMatches.__len__c                 C   s   t | jdkS r_   r   r=   r)   r)   r*   __bool__  s    zCharsetMatches.__bool__c                 C   s|   t |tstdt|jt|jtk r`| j	D ],}|j
|j
kr2|j|jkr2||  dS q2| j	| t| j	| _	dS )z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r.   r   r4   r@   r/   rB   r8   r<   r   r~   r1   r6   rD   rC   rd   )r(   r   matchr)   r)   r*   rC     s    


zCharsetMatches.appendr   c                 C   s   | j s
dS | j d S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   r   r=   r)   r)   r*   best(  s    zCharsetMatches.bestc                 C   s   |   S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )r   r=   r)   r)   r*   first0  s    zCharsetMatches.first)N)rt   ru   rv   __doc__r
   r	   r   r+   r   r   r   r   r/   r   r   ry   r   rC   r   r   r)   r)   r)   r*   r|      s   r|   c                   @   sj   e Zd Zeee ee ee eee eeeee edddZe	e
eef dddZedddZd	S )
CliDetectionResultpathr0   rI   alternative_encodingsrV   rf   r   r6   r7   unicode_pathis_preferredc                 C   sF   || _ |
| _|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
d S rE   )r   r   r0   rI   r   rV   rf   r   r6   r7   r   )r(   r   r0   rI   r   rV   rf   r   r6   r7   r   r   r)   r)   r*   r+   <  s    zCliDetectionResult.__init__r;   c                 C   s2   | j | j| j| j| j| j| j| j| j| j	| j
dS )Nr   r   r=   r)   r)   r*   __dict__V  s    zCliDetectionResult.__dict__c                 C   s   t | jdddS )NT   )ensure_asciiindent)r   r   r=   r)   r)   r*   to_jsonf  s    zCliDetectionResult.to_jsonN)rt   ru   rv   r/   r
   r	   ry   rx   r+   r{   r   r   r   r   r)   r)   r)   r*   r   ;  s   r   N)Zencodings.aliasesr   hashlibr   jsonr   rer   typingr   r   r   r	   r
   r   r   Zconstantr   r   utilsr   r   r   r   r|   r/   rx   ZCoherenceMatchr   r   r)   r)   r)   r*   <module>   s   $ jC