wp-spider/src/parse_html.py

""" parses and processes the html for each page """
from time import sleep
import requests
def get_hrefs(soup, home_pass=True):
    """ takes the soup and returns all hrefs found,
    excluding # links and links to jpg files """
    url_set = set()
    # collect every absolute link in the soup
    all_links = soup.find_all("a")
    for link in all_links:
        try:
            url = link["href"]
        except KeyError:
            continue
        if url.startswith('http') and not url.endswith('#') and not url.lower().endswith('.jpg'):
            url_set.add(url)
    href_url_list = list(url_set)
    # strip theme nav and footer links when home_pass is set
    if home_pass:
        # split the soup into its nav and footer sections
        soup_nav = soup.find("nav", {"role": "navigation"})
        soup_footer = soup.find("div", {"class": "elementor-location-footer"})
        # collect the nav and footer links and filter them out
        try:
            all_nav_links = list(set([x["href"] for x in soup_nav.find_all("a")]))
            all_footer_links = list(set([x["href"] for x in soup_footer.find_all("a")]))
            href_url_list = [link for link in href_url_list if link not in all_nav_links]
            href_url_list = [link for link in href_url_list if link not in all_footer_links]
        except (AttributeError, KeyError):
            # nav or footer block missing on this page, or a link without an href
            pass
    href_url_list.sort()
    return href_url_list
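

# Usage sketch for get_hrefs, kept as a comment so nothing runs at import time.
# It assumes the soup is built with BeautifulSoup (which the call signatures in
# this module imply); the URL is purely illustrative:
#
#     from bs4 import BeautifulSoup
#     html = requests.get("https://example.com", timeout=10).text
#     soup = BeautifulSoup(html, "html.parser")
#     internal_links = get_hrefs(soup, home_pass=True)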


def get_images(soup, config):
    """ takes the soup and returns all images from
    img html tags, inline css and external CSS files """
    upload_folder = config['upload_folder']
    request_timeout = config['request_timeout']
    img_url_set = set()
    # from img tags
    all_imgs = soup.find_all("img")
    for img in all_imgs:
        url = img.get("src", "")
        if upload_folder in url:
            img_url_set.add(url)
    # from inline style attributes
    all_divs = soup.find_all('div')
    for div in all_divs:
        try:
            style = div["style"]
            if 'background-image' in style:
                # pull the url out of background-image: url(...)
                url = style.split('(')[1].split(')')[0]
                img_url_set.add(url)
        except (KeyError, IndexError):
            continue
    # gallery items linked from a tags
    all_a_tag = soup.find_all('a')
    for a_tag in all_a_tag:
        try:
            a_href = a_tag['href']
        except KeyError:
            continue
        if upload_folder in a_href:
            img_url_set.add(a_href)
    # from external CSS files
    all_external_css = soup.find_all("link", {"rel": "stylesheet"})
    for css_file in all_external_css:
        remote_file = css_file["href"]
        try:
            remote_css = requests.get(remote_file).text
        except requests.exceptions.ConnectionError:
            # wait and retry once if the connection drops
            sleep(request_timeout)
            remote_css = requests.get(remote_file).text
        css_rules = remote_css.split(';')
        for rule in css_rules:
            if upload_folder in rule:
                url = rule.split('(')[1].split(')')[0]
                img_url_set.add(url)
    img_url_list = list(img_url_set)
    img_url_list.sort()
    return img_url_list
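

if __name__ == "__main__":
    # Minimal manual check, not part of the crawler itself: fetch one page and
    # print what the two parsers find. The URL and config values below are
    # assumptions for illustration only, not project defaults.
    from bs4 import BeautifulSoup  # assumes beautifulsoup4 is installed

    demo_url = "https://example.com"  # hypothetical page to crawl
    demo_config = {
        "upload_folder": "/wp-content/uploads/",  # assumed WordPress media path
        "request_timeout": 5,  # seconds to sleep before retrying a fetch
    }
    page_html = requests.get(demo_url, timeout=10).text
    page_soup = BeautifulSoup(page_html, "html.parser")
    print(get_hrefs(page_soup))
    print(get_images(page_soup, demo_config))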