From 631dade746a4a5b550dad518f218f8090602c8f9 Mon Sep 17 00:00:00 2001
From: Roman Hergenreder <mail@romanh.de>
Date: Mon, 24 Jan 2022 20:01:49 +0100
Subject: [PATCH] only parse html files

---
 crawl_urls.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 crawl_urls.py

diff --git a/crawl_urls.py b/crawl_urls.py
old mode 100644
new mode 100755
index d47a061..1f31cfa
--- a/crawl_urls.py
+++ b/crawl_urls.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 import argparse
 import urllib.parse
 import urllib3
@@ -55,6 +57,10 @@ class Crawler:
 
             self.visited.add(url)
             res = self.request(url)
+            content_type = res.headers.get("Content-Type") or ""  # header may be absent
+            if content_type.split(";")[0].strip().lower() != "text/html":  # ignore params
+                continue
+
             urls = self.collect_urls(res.text)
             for url in urls:
                 parts = urllib.parse.urlparse(url)
@@ -124,4 +130,4 @@ if __name__ == "__main__":
 
     for name, values in results.items():
         print(f"=== {name} ===")
-        print("\n".join(values))
\ No newline at end of file
+        print("\n".join(values))