3 Pf$+@sddlmZmZmZddlmZddlmZddl m Z ddl m Z ddl m Z m Z ddl mZmZmZdd l mZmZdd l mZdd lmZdd lmZee ZGd ddeZdS))absolute_importdivisionunicode_literals)unichr)deque)spaceCharacters)entities) asciiLettersasciiUpper2Lower)digits hexDigitsEOF) tokenTypes tagTokenTypes)replacementCharacters)HTMLInputStream)TriecsdeZdZdZdfdd ZddZddZdd d Zd d ZddZ ddZ ddZ ddZ ddZ ddZddZddZddZd d!Zd"d#Zd$d%Zd&d'Zd(d)Zd*d+Zd,d-Zd.d/Zd0d1Zd2d3Zd4d5Zd6d7Zd8d9Zd:d;Zdd?Z!d@dAZ"dBdCZ#dDdEZ$dFdGZ%dHdIZ&dJdKZ'dLdMZ(dNdOZ)dPdQZ*dRdSZ+dTdUZ,dVdWZ-dXdYZ.dZd[Z/d\d]Z0d^d_Z1d`daZ2dbdcZ3dddeZ4dfdgZ5dhdiZ6djdkZ7dldmZ8dndoZ9dpdqZ:drdsZ;dtduZdzd{Z?d|d}Z@d~dZAddZBddZCddZDddZEddZFddZGddZHddZIddZJddZKddZLZMS) HTMLTokenizera  This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to HTMLInputStream object. Nc sFt|f||_||_d|_g|_|j|_d|_d|_t t |j dS)NF) rstreamparserZ escapeFlagZ lastFourChars dataStatestateescape currentTokensuperr__init__)selfrrkwargs) __class__ /usr/lib/python3.6/_tokenizer.pyr"szHTMLTokenizer.__init__ccs\tg|_xL|jrVx&|jjr:td|jjjddVqWx|jrR|jjVq>Wq WdS)z This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. ParseErrorr)typedataN)r tokenQueuerrerrorsrpoppopleft)rr r r!__iter__1s    zHTMLTokenizer.__iter__c %Cs(t}d}|rt}d}g}|jj}x(||krJ|tk rJ|j||jj}q$Wtdj||}|tkrt|}|j jt ddd|idnld|kod kns|d krd }|j jt ddd|idn(d |kod knsd|kodknsd|kodknsd|ko4dkns|t ddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d g#kr|j jt ddd|idy t |}Wn>t k r|d6}t d|d?Bt d7|d8@B}YnX|d9kr$|j jt dd:d;|jj||S)r"z'expected-tag-name-but-got-right-bracket)r#r$rJz<>?z'expected-tag-name-but-got-question-markzexpected-tag-namerDT)rr6markupDeclarationOpenStatercloseTagOpenStater rr tagNameStater%r7rr=bogusCommentState)rr$r r r!r]is6               zHTMLTokenizer.tagOpenStatecCs|jj}|tkr0td|gdd|_|j|_n|dkrX|jjtddd|j |_nn|t kr|jjtddd|jjtd d d|j |_n0|jjtdd d |id |jj ||j |_dS)NrVF)r#rUr$rWrjr"z*expected-closing-tag-but-got-right-bracket)r#r$z expected-closing-tag-but-got-eofrJz|tkr|jjtdd d|j |_n|jjtd|dd S) Nr|rJ)r#r$rDr[r"zinvalid-codepointu�zeof-in-script-in-scriptT) rr6r%r7r scriptDataDoubleEscapedDashStater(scriptDataDoubleEscapedLessThanSignStaterr)rr$r r r!rs$          z*HTMLTokenizer.scriptDataDoubleEscapedStatecCs|jj}|dkr2|jjtddd|j|_n|dkrZ|jjtddd|j|_n|dkr|jjtddd|jjtddd|j|_nF|t kr|jjtdd d|j |_n|jjtd|d|j|_d S) Nr|rJ)r#r$rDr[r"zinvalid-codepointu�zeof-in-script-in-scriptT) rr6r%r7r$scriptDataDoubleEscapedDashDashStaterrrrr)rr$r r r!rs(           z.HTMLTokenizer.scriptDataDoubleEscapedDashStatecCs|jj}|dkr*|jjtdddn|dkrR|jjtddd|j|_n|dkrz|jjtddd|j|_n|dkr|jjtddd|jjtdd d|j|_nF|t kr|jjtdd d|j |_n|jjtd|d|j|_d S) Nr|rJ)r#r$rDrjr[r"zinvalid-codepointu�zeof-in-script-in-scriptT) rr6r%r7rrrrfrrr)rr$r r r!rs,           z2HTMLTokenizer.scriptDataDoubleEscapedDashDashStatecCsP|jj}|dkr8|jjtdddd|_|j|_n|jj||j |_dS)NrirJ)r#r$r,T) rr6r%r7rrrscriptDataDoubleEscapeEndStaterr=r)rr$r r r!r0s   z6HTMLTokenizer.scriptDataDoubleEscapedLessThanSignStatecCs|jj}|ttdBkrR|jjtd|d|jjdkrH|j |_ q|j |_ nB|t kr|jjtd|d|j|7_n|jj ||j |_ dS)NrirjrJ)r#r$rT)rirj)rr6rr:r%r7rrrrurrrr r=)rr$r r r!r;s    z,HTMLTokenizer.scriptDataDoubleEscapeEndStatecCs0|jj}|tkr$|jjtdn|tkrJ|jdj|dg|j|_n|dkr\|j n|dkrn|j |_n|dkr|j jt d d d |jdj|dg|j|_n|d kr|j jt d dd |jdjddg|j|_nF|t kr|j jt d dd |j|_n|jdj|dg|j|_dS)NTr$r,rjri'"rHrDr"z#invalid-character-in-attribute-name)r#r$r[zinvalid-codepointu�z#expected-attribute-name-but-got-eof)rrrHrD)rr6rr^r rr7attributeNameStaterrZrqr%rrr)rr$r r r!rpKs6              z&HTMLTokenizer.beforeAttributeNameStatecCs|jj}d}d}|dkr&|j|_n0|tkr^|jddd||jjtd7<d}n|dkrld}n|tkr~|j|_n|dkr|j |_n|d kr|j j t d d d |jdddd 7<d}n|dkr|j j t d dd |jddd|7<d}nH|t kr8|j j t d dd |j|_n|jddd|7<d}|r|jdddjt|jddd<xP|jdddD]:\}}|jddd|kr|j j t d dd PqW|r|jdS)NTFrHr$rrrjrir[r"zinvalid-codepoint)r#r$u�rrrDz#invalid-character-in-attribute-namezeof-in-attribute-namezduplicate-attributerKrK)rrrDrKrKrKrKrKrK)rr6beforeAttributeValueStaterr rr^rafterAttributeNameStaterqr%r7rrrrXr rZ)rr$ZleavingThisStateZ emitTokenrU_r r r!risR             &  z HTMLTokenizer.attributeNameStatecCsF|jj}|tkr$|jjtdn|dkr8|j|_n |dkrJ|jn|tkrp|jdj |dg|j |_n|dkr|j |_n|dkr|j j t dd d |jdj d dg|j |_n|dkr|j j t ddd |jdj |dg|j |_nF|tkr&|j j t ddd |j|_n|jdj |dg|j |_dS)NTrHrjr$r,rir[r"zinvalid-codepoint)r#r$u�rrrDz&invalid-character-after-attribute-namezexpected-end-of-tag-but-got-eof)rrrD)rr6rr^rrrZr rr7rrqr%rrr)rr$r r r!rs:                z%HTMLTokenizer.afterAttributeNameStatecCsj|jj}|tkr$|jjtdnB|dkr8|j|_n.|dkrX|j|_|jj|n|dkrl|j|_n|dkr|j j t ddd|j n|d kr|j j t dd d|j d dd d 7<|j|_n|dkr|j j t ddd|j d dd |7<|j|_nL|tkrD|j j t ddd|j|_n"|j d dd |7<|j|_dS)NTrrCrrjr"z.expected-attribute-value-but-got-right-bracket)r#r$r[zinvalid-codepointr$ru�rHrD`z"equals-in-unquoted-attribute-valuez$expected-attribute-value-but-got-eofrK)rHrDrrKrK)rr6rr^attributeValueDoubleQuotedStaterattributeValueUnQuotedStater=attributeValueSingleQuotedStater%r7rrZrrr)rr$r r r!rs>                 z'HTMLTokenizer.beforeAttributeValueStatecCs|jj}|dkr|j|_n|dkr0|jdn|dkrj|jjtddd|jdd dd 7<nN|t kr|jjtdd d|j |_n&|jdd d||jj d7<d S)NrrCr[r"zinvalid-codepoint)r#r$r$ru�z#eof-in-attribute-value-double-quoteTrKrK)rrCr[) rr6afterAttributeValueStaterrTr%r7rrrrr^)rr$r r r!rs         z-HTMLTokenizer.attributeValueDoubleQuotedStatecCs|jj}|dkr|j|_n|dkr0|jdn|dkrj|jjtddd|jdd dd 7<nN|t kr|jjtdd d|j |_n&|jdd d||jj d7<d S)NrrCr[r"zinvalid-codepoint)r#r$r$ru�z#eof-in-attribute-value-single-quoteTrKrK)rrCr[) rr6rrrTr%r7rrrrr^)rr$r r r!rs         z-HTMLTokenizer.attributeValueSingleQuotedStatecCs|jj}|tkr|j|_n|dkr2|jdn|dkrD|jn|dkr~|jjt dd d |j d dd |7<n|d kr|jjt ddd |j d dd d7<nV|t kr|jjt ddd |j |_n.|j d dd ||jj tdtB7<dS)NrCrjrrrHrDrr"z0unexpected-character-in-unquoted-attribute-value)r#r$r$rr[zinvalid-codepointu�z eof-in-attribute-value-no-quotesT)rrrHrDrrKrKrK)rCrjrrrHrDrr[)rr6rrprrTrZr%r7rrrrr^r:)rr$r r r!rs,           z)HTMLTokenizer.attributeValueUnQuotedStatecCs|jj}|tkr|j|_n|dkr.|jnp|dkr@|j|_n^|tkrt|jj t ddd|jj ||j |_n*|jj t ddd|jj ||j|_dS)Nrjrir"z$unexpected-EOF-after-attribute-value)r#r$z*unexpected-character-after-attribute-valueT) rr6rrprrZrqrr%r7rr=r)rr$r r r!r s"           z&HTMLTokenizer.afterAttributeValueStatecCs|jj}|dkr&d|jd<|jn^|tkrZ|jjtddd|jj||j |_ n*|jjtddd|jj||j |_ dS)NrjTrWr"z#unexpected-EOF-after-solidus-in-tag)r#r$z)unexpected-character-after-solidus-in-tag) rr6rrZrr%r7rr=rrrp)rr$r r r!rq4s          z&HTMLTokenizer.selfClosingStartTagStatecCsD|jjd}|jdd}|jjtd|d|jj|j|_dS)Nrjr[u�Comment)r#r$T) rr^replacer%r7rr6rr)rr$r r r!roFs   zHTMLTokenizer.bogusCommentStatecCs|jjg}|ddkrT|j|jj|ddkrPtddd|_|j|_dSn|ddkrd}x.d&D]&}|j|jj|d'|krjd}PqjW|rtdddddd|_|j|_dSn|d(dkrH|jdk rH|jj j rH|jj j d)j |jj j krHd}x2d*D]*}|j|jj|d+|krd}PqW|rH|j |_dS|jjtdddx|rz|jj|jq`W|j|_dS),Nrr|rr,)r#r$TdDoOr@CtTyYpPeEFZDoctype)r#rUpublicIdsystemIdcorrect[Ar"zexpected-dashes-or-doctyperKrKrK)rrrrr@rrrrrrrrr)rrrrrrrKrKrK)rrrrrrrK)rr6r7rrcommentStartStater doctypeStaterZtreeZ openElements namespaceZdefaultNamespacecdataSectionStater%r=r'ro)rr?matchedexpectedr r r!rlUsR           z(HTMLTokenizer.markupDeclarationOpenStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd d|jj|j|j|_nP|t kr|jjtdd d|jj|j|j|_n|jd|7<|j |_d S) Nr|r[r"zinvalid-codepoint)r#r$r$u�rjzincorrect-commentzeof-in-commentT) rr6commentStartDashStaterr%r7rrrr commentState)rr$r r r!rs(          zHTMLTokenizer.commentStartStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd d|jj|j|j|_nT|t kr|jjtdd d|jj|j|j|_n|jdd|7<|j |_d S) Nr|r[r"zinvalid-codepoint)r#r$r$u-�rjzincorrect-commentzeof-in-commentT) rr6commentEndStaterr%r7rrrrr)rr$r r r!rs(          z#HTMLTokenizer.commentStartDashStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<nT|tkr|jjtddd|jj|j|j |_n|jd||jj d 7<d S) Nr|r[r"zinvalid-codepoint)r#r$r$u�zeof-in-commentT)r|r[) rr6commentEndDashStaterr%r7rrrrr^)rr$r r r!rs        zHTMLTokenizer.commentStatecCs|jj}|dkr|j|_n|dkrV|jjtddd|jdd7<|j|_nT|t kr|jjtddd|jj|j|j |_n|jdd|7<|j|_d S) Nr|r[r"zinvalid-codepoint)r#r$r$u-�zeof-in-comment-end-dashT) rr6rrr%r7rrrrr)rr$r r r!rs         z!HTMLTokenizer.commentEndDashStatecCs,|jj}|dkr*|jj|j|j|_n|dkrd|jjtddd|jdd7<|j|_n|dkr|jjtdd d|j |_n|d kr|jjtdd d|jd|7<nj|t kr|jjtdd d|jj|j|j|_n4|jjtdd d|jdd|7<|j|_dS)Nrjr[r"zinvalid-codepoint)r#r$r$u--�rhz,unexpected-bang-after-double-dash-in-commentr|z,unexpected-dash-after-double-dash-in-commentzeof-in-comment-double-dashzunexpected-char-in-commentz--T) rr6r%r7rrrrrcommentEndBangStater)rr$r r r!rs6               zHTMLTokenizer.commentEndStatecCs|jj}|dkr*|jj|j|j|_n|dkrN|jdd7<|j|_n|dkr|jjtddd|jdd 7<|j |_nT|t kr|jjtdd d|jj|j|j|_n|jdd|7<|j |_d S) Nrjr|r$z--!r[r"zinvalid-codepoint)r#r$u--!�zeof-in-comment-end-bang-stateT) rr6r%r7rrrrrrr)rr$r r r!rs(         z!HTMLTokenizer.commentEndBangStatecCs|jj}|tkr|j|_nj|tkr\|jjtdddd|j d<|jj|j |j |_n*|jjtddd|jj ||j|_dS)Nr"z!expected-doctype-name-but-got-eof)r#r$Frzneed-space-after-doctypeT) rr6rbeforeDoctypeNameStaterrr%r7rrrr=)rr$r r r!r s         zHTMLTokenizer.doctypeStatecCs|jj}|tkrn|dkrT|jjtdddd|jd<|jj|j|j|_n|dkr|jjtdddd |jd <|j |_nR|t kr|jjtdd dd|jd<|jj|j|j|_n||jd <|j |_d S) Nrjr"z+expected-doctype-name-but-got-right-bracket)r#r$Frr[zinvalid-codepointu�rUz!expected-doctype-name-but-got-eofT) rr6rr%r7rrrrdoctypeNameStater)rr$r r r!rs.              z$HTMLTokenizer.beforeDoctypeNameStatecCs|jj}|tkr2|jdjt|jd<|j|_n|dkrh|jdjt|jd<|jj |j|j |_n|dkr|jj t ddd|jdd7<|j |_nh|t kr|jj t dddd |jd <|jdjt|jd<|jj |j|j |_n|jd|7<d S) NrUrjr[r"zinvalid-codepoint)r#r$u�zeof-in-doctype-nameFrT)rr6rrrXr afterDoctypeNameStaterr%r7rrrr)rr$r r r!r6s,          zHTMLTokenizer.doctypeNameStatecCsR|jj}|tkrn8|dkr8|jj|j|j|_n|tkrd|jd<|jj ||jjt ddd|jj|j|j|_n|d!krd }x$d'D]}|jj}||krd}PqW|r|j |_d SnJ|d(krd }x(d.D] }|jj}||krd}PqW|r|j |_d S|jj ||jjt ddd|id d|jd<|j |_d S)/NrjFrr"zeof-in-doctype)r#r$rrTuUbBlLiIr@rsSrrrrrrmMz*expected-space-or-right-bracket-in-doctyper$)r#r$r.)rrrrrrrrrrr@r)rrrrr)rrrrrrrrrrrr)rrrrr)rr6rr%r7rrrrr=rafterDoctypePublicKeywordStateafterDoctypeSystemKeywordStatebogusDoctypeState)rr$rrr r r!rOsT              z#HTMLTokenizer.afterDoctypeNameStatecCs|jj}|tkr|j|_n|d krP|jjtddd|jj||j|_nT|t kr|jjtdddd|j d<|jj|j |j |_n|jj||j|_d S) Nrrr"zunexpected-char-in-doctype)r#r$zeof-in-doctypeFrT)rr) rr6r"beforeDoctypePublicIdentifierStaterr%r7rr=rrr)rr$r r r!rs"           z,HTMLTokenizer.afterDoctypePublicKeywordStatecCs|jj}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jjt dddd |jd <|jj|j|j |_nh|t kr|jjt dd dd |jd <|jj|j|j |_n(|jjt dd dd |jd <|j |_d S)Nrr,rrrjr"zunexpected-end-of-doctype)r#r$Frzeof-in-doctypezunexpected-char-in-doctypeT) rr6rr(doctypePublicIdentifierDoubleQuotedStater(doctypePublicIdentifierSingleQuotedStater%r7rrrr)rr$r r r!rs4                z0HTMLTokenizer.beforeDoctypePublicIdentifierStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrr[r"zinvalid-codepoint)r#r$ru�rjzunexpected-end-of-doctypeFrzeof-in-doctypeT) rr6!afterDoctypePublicIdentifierStaterr%r7rrrr)rr$r r r!rs*            z6HTMLTokenizer.doctypePublicIdentifierDoubleQuotedStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrr[r"zinvalid-codepoint)r#r$ru�rjzunexpected-end-of-doctypeFrzeof-in-doctypeT) rr6rrr%r7rrrr)rr$r r r!rs*            z6HTMLTokenizer.doctypePublicIdentifierSingleQuotedStatecCs |jj}|tkr|j|_n|dkr<|jj|j|j|_n|dkrn|jjt dddd|jd<|j |_n|dkr|jjt dddd|jd<|j |_nh|t kr|jjt dd dd |jd <|jj|j|j|_n(|jjt dddd |jd <|j |_d S) Nrjrr"zunexpected-char-in-doctype)r#r$r,rrzeof-in-doctypeFrT)rr6r-betweenDoctypePublicAndSystemIdentifiersStaterr%r7rrr(doctypeSystemIdentifierDoubleQuotedState(doctypeSystemIdentifierSingleQuotedStaterr)rr$r r r!rs6                  z/HTMLTokenizer.afterDoctypePublicIdentifierStatecCs|jj}|tkrn|dkr4|jj|j|j|_n|dkrPd|jd<|j|_n|dkrld|jd<|j |_nh|t kr|jjt dddd |jd <|jj|j|j|_n(|jjt dd dd |jd <|j |_d S) Nrjrr,rrr"zeof-in-doctype)r#r$Frzunexpected-char-in-doctypeT) rr6rr%r7rrrrrrrr)rr$r r r!rs.             z;HTMLTokenizer.betweenDoctypePublicAndSystemIdentifiersStatecCs|jj}|tkr|j|_n|d krP|jjtddd|jj||j|_nT|t kr|jjtdddd|j d<|jj|j |j |_n|jj||j|_d S) Nrrr"zunexpected-char-in-doctype)r#r$zeof-in-doctypeFrT)rr) rr6r"beforeDoctypeSystemIdentifierStaterr%r7rr=rrr)rr$r r r!rs"           z,HTMLTokenizer.afterDoctypeSystemKeywordStatecCs|jj}|tkrn|dkr0d|jd<|j|_n|dkrLd|jd<|j|_n|dkr|jjt dddd |jd <|jj|j|j |_nh|t kr|jjt dd dd |jd <|jj|j|j |_n(|jjt dddd |jd <|j |_d S) Nrr,rrrjr"zunexpected-char-in-doctype)r#r$Frzeof-in-doctypeT) rr6rrrrrr%r7rrrr)rr$r r r!r/s4                z0HTMLTokenizer.beforeDoctypeSystemIdentifierStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrr[r"zinvalid-codepoint)r#r$ru�rjzunexpected-end-of-doctypeFrzeof-in-doctypeT) rr6!afterDoctypeSystemIdentifierStaterr%r7rrrr)rr$r r r!rLs*            z6HTMLTokenizer.doctypeSystemIdentifierDoubleQuotedStatecCs|jj}|dkr|j|_n|dkrN|jjtddd|jdd7<n|dkr|jjtdd dd |jd <|jj|j|j|_nR|t kr|jjtdd dd |jd <|jj|j|j|_n|jd|7<d S)Nrr[r"zinvalid-codepoint)r#r$ru�rjzunexpected-end-of-doctypeFrzeof-in-doctypeT) rr6rrr%r7rrrr)rr$r r r!rds*            z6HTMLTokenizer.doctypeSystemIdentifierSingleQuotedStatecCs|jj}|tkrn~|dkr4|jj|j|j|_n^|tkrt|jjt dddd|jd<|jj|j|j|_n|jjt ddd|j |_dS) Nrjr"zeof-in-doctype)r#r$Frzunexpected-char-in-doctypeT) rr6rr%r7rrrrrr)rr$r r r!r|s         z/HTMLTokenizer.afterDoctypeSystemIdentifierStatecCsZ|jj}|dkr*|jj|j|j|_n,|tkrV|jj||jj|j|j|_ndS)NrjT) rr6r%r7rrrrr=)rr$r r r!rs    zHTMLTokenizer.bogusDoctypeStatecCsg}x|j|jjd|j|jjd|jj}|tkr@Pq|dksLt|ddddkrx|ddd|d<Pq|j|qWdj|}|jd}|dkrx&t|D]}|j jt d d d qW|j dd }|r|j jt d |d |j |_ dS)N]rjrz]]r,r[rr"zinvalid-codepoint)r#r$u�rJTrKrKrrK)r7rr^r6rAssertionErrorr9countranger%rrrr)rr$r6Z nullCountrr r r!rs0        zHTMLTokenizer.cdataSectionState)N)NF)N__name__ __module__ __qualname____doc__rr)rBrSrTrZrr\rbr`rdrfrgr]rmrnrarsrtrcrwrxreryr{rzr}rrr~rrrrrrrrrrprrrrrrrrqrorlrrrrrrrrrrrrrrrrrrrrrrr __classcell__r r )rr!rs H P#         6 "-3rN)Z __future__rrrZpip._vendor.sixrr; collectionsrZ constantsrr r r r r rrrrZ _inputstreamrZ_trierrLobjectrr r r r!s