diff --git a/crawl_urls.py b/crawl_urls.py old mode 100644 new mode 100755 index d47a061..1f31cfa --- a/crawl_urls.py +++ b/crawl_urls.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import argparse import urllib.parse import urllib3 @@ -55,6 +57,10 @@ class Crawler: self.visited.add(url) res = self.request(url) + content_type = res.headers.get("Content-Type", "") + if content_type.split(";")[0].strip().lower() != "text/html": + continue + urls = self.collect_urls(res.text) for url in urls: parts = urllib.parse.urlparse(url) @@ -124,4 +130,4 @@ if __name__ == "__main__": for name, values in results.items(): print(f"=== {name} ===") - print("\n".join(values)) \ No newline at end of file + print("\n".join(values))