U e5dÐ$ã@s\dZddlZddlZddlZdgZe dd¡ZGdd„dƒZGdd„dƒZ Gd d „d ƒZ dS) a% robotparser.py Copyright (C) 2000 Bastian Kleineidam You can choose between two licenses when using this package: 1) GNU GPLv2 2) PSF license for Python 2.2 The robots.txt Exclusion Protocol is implemented as specified in http://www.robotstxt.org/norobots-rfc.txt éNÚRobotFileParserÚ RequestRatezrequests secondsc@sreZdZdZddd„Zdd„Zdd„Zd d „Zd d „Zd d„Z dd„Z dd„Z dd„Z dd„Z dd„Zdd„ZdS)rzs This class provides a set of methods to read, parse and answer questions about a single robots.txt file. ÚcCs2g|_g|_d|_d|_d|_| |¡d|_dS)NFr)ÚentriesÚsitemapsÚ default_entryÚ disallow_allÚ allow_allÚset_urlÚ last_checked©ÚselfÚurl©rú*/usr/lib64/python3.8/urllib/robotparser.pyÚ__init__s zRobotFileParser.__init__cCs|jS)z·Returns the time the robots.txt file was last fetched. This is useful for long-running web spiders that need to check for new robots.txt files periodically. )r ©r rrrÚmtime%szRobotFileParser.mtimecCsddl}| ¡|_dS)zYSets the time the robots.txt file was last fetched to the current time. rN)Útimer )r rrrrÚmodified.szRobotFileParser.modifiedcCs&||_tj |¡dd…\|_|_dS)z,Sets the URL referring to a robots.txt file.ééN)rÚurllibÚparseÚurlparseZhostÚpathr rrrr 6szRobotFileParser.set_urlc Cs†ztj |j¡}WnRtjjk rd}z0|jdkr:d|_n|jdkrT|jdkrTd|_W5d}~XYnX|  ¡}|  |  d¡  ¡¡dS)z4Reads the robots.txt URL and feeds it to the parser.)i‘i“TiiôNzutf-8) rZrequestZurlopenrÚerrorZ HTTPErrorÚcoderr ÚreadrÚdecodeÚ splitlines)r ÚfÚerrÚrawrrrr;s zRobotFileParser.readcCs,d|jkr|jdkr(||_n |j |¡dS©NÚ*)Ú useragentsrrÚappend)r ÚentryrrrÚ _add_entryHs  zRobotFileParser._add_entrycCsPd}tƒ}| ¡|D]}|sP|dkr4tƒ}d}n|dkrP| |¡tƒ}d}| d¡}|dkrn|d|…}| ¡}|s|q| dd¡}t|ƒdkr|d ¡ ¡|d<tj   |d ¡¡|d<|ddkrú|dkrä| |¡tƒ}|j   |d¡d}q|ddkr.|dkr6|j   t|dd ƒ¡d}q|dd krb|dkr6|j   t|dd ƒ¡d}q|dd kr |dkr6|d ¡ ¡ršt|dƒ|_d}q|dd kr|dkr6|d d¡}t|ƒdkr|d ¡ ¡r|d ¡ ¡rtt|dƒt|dƒƒ|_d}q|ddkr|j  |d¡q|dkrL| |¡dS)z”Parse the input lines from a robots.txt file. We allow that a user-agent: line is not preceded by one or more blank lines. rréú#Nú:z user-agentZdisallowFZallowTz crawl-delayz request-rateú/Zsitemap)ÚEntryrr)ÚfindÚstripÚsplitÚlenÚlowerrrÚunquoter&r'Ú rulelinesÚRuleLineÚisdigitÚintÚdelayrÚreq_rater)r ÚlinesÚstater(ÚlineÚiZnumbersrrrrQsj              ÿ  zRobotFileParser.parsecCs |jr dS|jrdS|jsdStj tj |¡¡}tj dd|j|j |j |j f¡}tj  |¡}|sfd}|j D]}| |¡rl| |¡Sql|jrœ|j |¡SdS)z=using the parsed robots.txt decide if useragent can fetch urlFTrr-)rr r rrrr4Ú urlunparserZparamsZqueryZfragmentÚquoterÚ applies_toÚ allowancer)r Ú useragentrZ parsed_urlr(rrrÚ can_fetchšs*ÿ    zRobotFileParser.can_fetchcCs>| ¡s dS|jD]}| |¡r|jSq|jr:|jjSdS©N)rrrAr9r©r rCr(rrrÚ crawl_delay·s   zRobotFileParser.crawl_delaycCs>| ¡s dS|jD]}| |¡r|jSq|jr:|jjSdSrE)rrrAr:rrFrrrÚ request_rateÁs   zRobotFileParser.request_ratecCs|js dS|jSrE)rrrrrÚ site_mapsËszRobotFileParser.site_mapscCs,|j}|jdk r||jg}d tt|ƒ¡S)Nz )rrÚjoinÚmapÚstr)r rrrrÚ__str__Ðs  zRobotFileParser.__str__N)r)Ú__name__Ú __module__Ú __qualname__Ú__doc__rrrr rr)rrDrGrHrIrMrrrrrs    I  c@s(eZdZdZdd„Zdd„Zdd„ZdS) r6zoA rule line is a single "Allow:" (allowance==True) or "Disallow:" (allowance==False) followed by a path.cCs<|dkr|sd}tj tj |¡¡}tj |¡|_||_dS)NrT)rrr?rr@rrB)r rrBrrrrÚs  zRuleLine.__init__cCs|jdkp| |j¡Sr$)rÚ startswith)r ÚfilenamerrrrAâszRuleLine.applies_tocCs|jr dndd|jS)NZAllowZDisallowz: )rBrrrrrrMåszRuleLine.__str__N)rNrOrPrQrrArMrrrrr6×sr6c@s0eZdZdZdd„Zdd„Zdd„Zdd „Zd S) r.z?An entry has one or more user-agents and zero or more rulelinescCsg|_g|_d|_d|_dSrE)r&r5r9r:rrrrrëszEntry.__init__cCs‚g}|jD]}| d|›¡q |jdk r<| d|j›¡|jdk rf|j}| d|j›d|j›¡| tt|j ƒ¡d  |¡S)Nz User-agent: z Crawl-delay: zRequest-rate: r-Ú ) r&r'r9r:ZrequestsZsecondsÚextendrKrLr5rJ)r ZretÚagentZraterrrrMñs   z Entry.__str__cCsF| d¡d ¡}|jD](}|dkr*dS| ¡}||krdSqdS)z2check if this entry applies to the specified agentr-rr%TF)r1r3r&)r rCrVrrrrAýs zEntry.applies_tocCs$|jD]}| |¡r|jSqdS)zZPreconditions: - our agent applies to this entry - filename is URL decodedT)r5rArB)r rSr=rrrrB s   zEntry.allowanceN)rNrOrPrQrrMrArBrrrrr.és   r.) rQÚ collectionsZ urllib.parserZurllib.requestÚ__all__Ú namedtuplerrr6r.rrrrÚs  B