From 49e122523e69564d4cd12ae0f604214e4889203a Mon Sep 17 00:00:00 2001
From: simon
Date: Wed, 3 Feb 2021 10:32:24 +0700
Subject: [PATCH] improved sitemap parsing

---
 README.md           |  5 +++--
 src/second_stage.py | 54 ++++++++++++++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 4a739cd..f666253 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ The downside, additional to the limitations above is, that depending on the amou
 
 ## Installation
 
-Install required none standard Python libraries:
+Install the required non-standard Python library **requests**, used to make the HTTP calls ([link](https://pypi.org/project/requests/)):
 
 * On Arch: `sudo pacman -S python-requests`
 * Via Pip: `pip install requests`
@@ -72,8 +72,9 @@ Copy or rename the file *config.sample* to *config* and make sure you set all th
 The config file supports the following settings:
 * *start_url* : Fully qualified URL of the home page of the website to parse. Add *www* if your canonical website uses it to avoid landing in a redirect for every request.
   * example: `https://www.example.com/`
-* *sitemap_url* : Link to the sitemap, so pages not linked anywhere but indexed can get parsed too.
+* *sitemap_url* : Link to the sitemap, so pages not linked anywhere but indexed can get parsed too. This can be a direct link to your sitemap or a link to a sitemap index (a list of sitemaps).
   * example: `https://www.example.com/sitemap_index.xml`
   * example: `https://www.example.com/sitemap.xml`
 * *upload_folder* : Wordpress upload folder where the media library builds the folder tree.
   * example: `https://www.example.com/wp-content/uploads/` for a default wordpress installation.
 * *valid_img_mime* : A comma separated list of image [MIME types](https://www.iana.org/assignments/media-types/media-types.xhtml#image) you want to consider as a image to check for its existence. An easy way to exclude files like PDFs or other media files.
diff --git a/src/second_stage.py b/src/second_stage.py
index 248a54e..ddcc558 100644
--- a/src/second_stage.py
+++ b/src/second_stage.py
@@ -12,7 +12,7 @@ def discover_sitemap(config, discovered):
     sitemap_url = config['sitemap_url']
     request_timeout = config['request_timeout']
     # get main
-    print("look at sitemap")
+    print("parsing sitemap")
     try:
         response = requests.get(sitemap_url)
     except ConnectionError:
@@ -20,29 +20,43 @@ def discover_sitemap(config, discovered):
         response = requests.get(sitemap_url)
     xml = response.text
     soup = BeautifulSoup(xml, features="lxml")
-    sitemap_tags = soup.find_all("sitemap")
-    sitemap_list = [map.findNext("loc").text for map in sitemap_tags]
-    # loop through all list and get map by map
-    all_sitemap_pages = []
-    for sitemap in sitemap_list:
-        try:
-            response = requests.get(sitemap)
-        except ConnectionError:
-            sleep(request_timeout)
-            response = requests.get(sitemap)
-        xml = response.text
-        soup = BeautifulSoup(xml, features="lxml")
-        page_tags = soup.find_all("url")
-        page_list = [map.findNext("loc").text for map in page_tags]
-        # add every page to list
-        for page in page_list:
-            all_sitemap_pages.append(page)
-    # sort and return
-    all_sitemap_pages.sort()
+    # build the list of pages from the sitemap(s)
+    all_sitemap_pages = parse_sitemap(soup, request_timeout)
     # add to discovered list if new
     discovered = [discovered.append(page) for page in all_sitemap_pages
                   if page not in discovered]
 
 
+def parse_sitemap(soup, request_timeout):
+    """ called from discover_sitemap to build the site list,
+    figures out if it's a single sitemap or a list of sitemaps """
+    sitemap_tags = soup.find_all("sitemap")
+    if not sitemap_tags:
+        # is already the sitemap, every <loc> is a page
+        page_tags = soup.find_all("url")
+        all_sitemap_pages = [tag.find_next("loc").text for tag in page_tags]
+    else:
+        # is a list of sitemaps to loop through
+        all_sitemap_pages = []
+        sitemap_list = [tag.find_next("loc").text for tag in sitemap_tags]
+        for sitemap in sitemap_list:
+            try:
+                response = requests.get(sitemap)
+            except ConnectionError:
+                sleep(request_timeout)
+                response = requests.get(sitemap)
+            xml = response.text
+            soup = BeautifulSoup(xml, features="lxml")
+            page_tags = soup.find_all("url")
+            page_list = [tag.find_next("loc").text for tag in page_tags]
+            # add every page to list
+            for page in page_list:
+                all_sitemap_pages.append(page)
+            # throttle between sitemap requests
+            sleep(request_timeout)
+    # sort and return
+    all_sitemap_pages.sort()
+    return all_sitemap_pages
+
 def get_media_lib(config):
     """ returns a list of dics of media files in library """
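
For reference, a minimal standalone sketch of the detection logic parse_sitemap() relies on: a sitemap index wraps its child sitemaps in <sitemap> tags, while a plain sitemap lists pages in <url> tags. The XML samples and example.com URLs below are invented for illustration; it only needs bs4 and lxml, which the script already depends on.

from bs4 import BeautifulSoup

# invented sample of a sitemap index: each <loc> points at a further sitemap
SITEMAP_INDEX = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://www.example.com/post-sitemap.xml</loc></sitemap>
  <sitemap><loc>https://www.example.com/page-sitemap.xml</loc></sitemap>
</sitemapindex>"""

# invented sample of a plain sitemap: each <loc> is a page itself
PLAIN_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://www.example.com/</loc></url>
  <url><loc>https://www.example.com/about/</loc></url>
</urlset>"""

for label, xml in (("index", SITEMAP_INDEX), ("plain", PLAIN_SITEMAP)):
    soup = BeautifulSoup(xml, features="lxml")
    sitemap_tags = soup.find_all("sitemap")
    if sitemap_tags:
        # sitemap index: these URLs must be fetched and parsed in turn
        locs = [tag.find("loc").text for tag in sitemap_tags]
    else:
        # already a sitemap: these URLs are the pages
        locs = [tag.find("loc").text for tag in soup.find_all("url")]
    print(label, "->", locs)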
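
And a hypothetical driver for the call site, matching the *sitemap_url* setting described in the README. The config values and the import path (assuming src/ is on the module path) are assumptions; the real script reads these settings from the *config* file, which defines more keys than shown here.

from second_stage import discover_sitemap

# hypothetical config values, mirroring the README examples
config = {
    "sitemap_url": "https://www.example.com/sitemap_index.xml",
    "request_timeout": 10,
}

discovered = []
discover_sitemap(config, discovered)  # appends any new pages in place
print(len(discovered), "pages discovered via sitemap")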