Only parse HTML files

This commit is contained in:
Roman Hergenreder 2022-01-24 20:01:49 +01:00
parent 8adc30d7ae
commit 631dade746

8
crawl_urls.py Normal file → Executable file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
import argparse import argparse
import urllib.parse import urllib.parse
import urllib3 import urllib3
@ -55,6 +57,10 @@ class Crawler:
self.visited.add(url) self.visited.add(url)
res = self.request(url) res = self.request(url)
content_type = res.headers.get("Content-Type", None)
if "text/html" not in content_type.lower().split(";"):
continue
urls = self.collect_urls(res.text) urls = self.collect_urls(res.text)
for url in urls: for url in urls:
parts = urllib.parse.urlparse(url) parts = urllib.parse.urlparse(url)
@ -124,4 +130,4 @@ if __name__ == "__main__":
for name, values in results.items(): for name, values in results.items():
print(f"=== {name} ===") print(f"=== {name} ===")
print("\n".join(values)) print("\n".join(values))