2021-01-31 07:39:33 +00:00
|
|
|
""" parses and processes the html for each page """
|
|
|
|
|
|
|
|
from time import sleep
|
|
|
|
import requests
|
|
|
|
|
|
|
|
def get_hrefs(soup, home_pass=True):
    """ takes the soup and returns all href found

    excludes # links and links to jpg files

    soup      -- parsed page (BeautifulSoup-like: needs .find_all / .find)
    home_pass -- when True, strip links that also appear in the top nav
                 or footer so only page-specific links remain
    returns   -- sorted list of unique absolute urls """

    url_set = set()

    # collect every absolute link; skip fragment-only (#) and .jpg targets
    for link in soup.find_all("a"):
        try:
            url = link["href"]
        except KeyError:
            # <a> tag without an href attribute
            continue
        if url.startswith('http') and not url.endswith('#') and not url.lower().endswith('.jpg'):
            url_set.add(url)

    # remove top nav items if not homepage
    if home_pass:
        soup_nav = soup.find("nav", {"role": "navigation"})
        soup_footer = soup.find("div", {"class": "elementor-location-footer"})

        # nav/footer may be missing (find returns None -> AttributeError)
        # or a tag may lack href (KeyError); keep the full set in that case,
        # matching the original best-effort behavior -- but without a bare
        # except that would also hide real bugs
        try:
            nav_links = {x["href"] for x in soup_nav.find_all("a")}
            footer_links = {x["href"] for x in soup_footer.find_all("a")}
        except (AttributeError, KeyError):
            pass
        else:
            # set difference instead of repeated O(n) list membership tests
            url_set -= nav_links | footer_links

    return sorted(url_set)
|
|
|
|
|
|
|
|
|
|
|
|
def get_images(soup, config):
    """ takes the soup and returns all images from

    img html tags, inline css and external CSS files

    soup   -- parsed page (BeautifulSoup-like: needs .find_all)
    config -- dict with 'upload_folder' (substring that marks site media
              urls) and 'request_timeout' (seconds; used both as the HTTP
              timeout and as the back-off before one retry)
    returns -- sorted list of unique image urls """

    upload_folder = config['upload_folder']
    request_timeout = config['request_timeout']

    img_url_set = set()

    # from img tag
    for img in soup.find_all("img"):
        try:
            url = img["src"]
        except KeyError:
            # <img> without a src attribute -- skip instead of crashing
            continue
        if upload_folder in url:
            img_url_set.add(url)

    # from inline style tag: extract the url inside background-image: url(...)
    for div in soup.find_all('div'):
        try:
            style = div["style"]
            if 'background-image' in style:
                url = style.split('(')[1].split(')')[0]
                img_url_set.add(url)
        except (KeyError, IndexError):
            # no style attribute, or malformed url(...) value
            continue

    # gallery item from a tag
    for a_tag in soup.find_all('a'):
        try:
            a_href = a_tag['href']
        except KeyError:
            continue
        if upload_folder in a_href:
            img_url_set.add(a_href)

    # external stylesheets: download each and scan its rules for media urls
    for css_file in soup.find_all("link", {"rel": "stylesheet"}):
        remote_file = css_file["href"]
        try:
            # timeout so a hung server cannot stall the whole crawl
            remote_css = requests.get(remote_file, timeout=request_timeout).text
        except requests.exceptions.ConnectionError:
            # NOTE: requests raises its own ConnectionError, which is NOT a
            # subclass of the builtin one the original caught -- the retry
            # never fired. Back off once, then retry (second failure raises).
            sleep(request_timeout)
            remote_css = requests.get(remote_file, timeout=request_timeout).text

        for rule in remote_css.split(';'):
            if upload_folder in rule:
                url = rule.split('(')[1].split(')')[0]
                img_url_set.add(url)

    return sorted(img_url_set)
|