From 631dade746a4a5b550dad518f218f8090602c8f9 Mon Sep 17 00:00:00 2001 From: Roman Hergenreder Date: Mon, 24 Jan 2022 20:01:49 +0100 Subject: [PATCH] only parse html files --- crawl_urls.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) mode change 100644 => 100755 crawl_urls.py diff --git a/crawl_urls.py b/crawl_urls.py old mode 100644 new mode 100755 index d47a061..1f31cfa --- a/crawl_urls.py +++ b/crawl_urls.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import argparse import urllib.parse import urllib3 @@ -55,6 +57,10 @@ class Crawler: self.visited.add(url) res = self.request(url) + content_type = res.headers.get("Content-Type", None) + if "text/html" not in content_type.lower().split(";"): + continue + urls = self.collect_urls(res.text) for url in urls: parts = urllib.parse.urlparse(url) @@ -124,4 +130,4 @@ if __name__ == "__main__": for name, values in results.items(): print(f"=== {name} ===") - print("\n".join(values)) \ No newline at end of file + print("\n".join(values))