Only parse HTML files

This commit is contained in:
Roman Hergenreder 2022-01-24 20:01:49 +01:00
parent 8adc30d7ae
commit 631dade746

6
crawl_urls.py Normal file → Executable file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
import argparse import argparse
import urllib.parse import urllib.parse
import urllib3 import urllib3
@ -55,6 +57,10 @@ class Crawler:
self.visited.add(url) self.visited.add(url)
res = self.request(url) res = self.request(url)
content_type = res.headers.get("Content-Type", None)
if "text/html" not in content_type.lower().split(";"):
continue
urls = self.collect_urls(res.text) urls = self.collect_urls(res.text)
for url in urls: for url in urls:
parts = urllib.parse.urlparse(url) parts = urllib.parse.urlparse(url)