Project Update

This commit is contained in:
2022-12-05 10:09:01 +01:00
parent a86fff1b36
commit 80abe85b85
11 changed files with 2722 additions and 1573 deletions

View File

@@ -79,14 +79,15 @@ class Crawler:
self.queue.put(parts._replace(netloc=self.domain, scheme=self.scheme,fragment="").geturl())
def collect_urls(self, page):
soup = BeautifulSoup(page, "html.parser")
if not isinstance(page, BeautifulSoup):
page = BeautifulSoup(page, "html.parser")
urls = set()
attrs = ["src","href","action"]
tags = ["a","link","script","img","form"]
for tag in tags:
for e in soup.find_all(tag):
for e in page.find_all(tag):
for attr in attrs:
if e.has_attr(attr):
urls.add(e[attr])