Skip to content

Commit 3f8ab96

Browse files
committed
Fix issue #6325 - robotparser to honor URLs with query strings.
1 parent 96a60ae commit 3f8ab96

2 files changed

Lines changed: 15 additions & 2 deletions

File tree

Lib/test/test_robotparser.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,17 @@ def RobotTest(index, robots_txt, good_urls, bad_urls,
205205
RobotTest(13, doc, good, bad, agent="googlebot")
206206

207207

208+
# 14. For issue #6325 (query string support)
209+
doc = """
210+
User-agent: *
211+
Disallow: /some/path?name=value
212+
"""
213+
214+
good = ['/some/path']
215+
bad = ['/some/path?name=value']
216+
217+
RobotTest(14, doc, good, bad)
218+
208219

209220
class NetworkTestCase(unittest.TestCase):
210221

Lib/urllib/robotparser.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,10 @@ def can_fetch(self, useragent, url):
129129
return True
130130
# search for given user agent matches
131131
# the first match counts
132-
url = urllib.parse.quote(
133-
urllib.parse.urlparse(urllib.parse.unquote(url))[2])
132+
parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
133+
url = urllib.parse.urlunparse(('','',parsed_url.path,
134+
parsed_url.params,parsed_url.query, parsed_url.fragment))
135+
url = urllib.parse.quote(url)
134136
if not url:
135137
url = "/"
136138
for entry in self.entries:

0 commit comments

Comments
 (0)