only parse html files

2022-01-24 20:01:49 +01:00 · 2022-01-24 20:01:49 +01:00 · 631dade746
commit 631dade746
parent 8adc30d7ae
1 changed file with 7 additions and 1 deletion
--- a/crawl_urls.py
+++ b/crawl_urls.py
@@ -1,3 +1,5 @@
 #!/usr/bin/env python3
 import argparse
 import urllib.parse
 import urllib3
@@ -55,6 +57,10 @@ class Crawler:
            self.visited.add(url)
            res = self.request(url)
            content_type = res.headers.get("Content-Type", None)
            if "text/html" not in content_type.lower().split(";"):
                continue
            urls = self.collect_urls(res.text)
            for url in urls:
                parts = urllib.parse.urlparse(url)