only parse html files
This commit is contained in:
parent
8adc30d7ae
commit
631dade746
6
crawl_urls.py
Normal file → Executable file
6
crawl_urls.py
Normal file → Executable file
@@ -1,3 +1,5 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import urllib3
|
import urllib3
|
||||||
@@ -55,6 +57,10 @@ class Crawler:
|
|||||||
|
|
||||||
self.visited.add(url)
|
self.visited.add(url)
|
||||||
res = self.request(url)
|
res = self.request(url)
|
||||||
|
content_type = res.headers.get("Content-Type", None)
|
||||||
|
if "text/html" not in content_type.lower().split(";"):
|
||||||
|
continue
|
||||||
|
|
||||||
urls = self.collect_urls(res.text)
|
urls = self.collect_urls(res.text)
|
||||||
for url in urls:
|
for url in urls:
|
||||||
parts = urllib.parse.urlparse(url)
|
parts = urllib.parse.urlparse(url)
|
||||||
|
Loading…
Reference in New Issue
Block a user