From 49e122523e69564d4cd12ae0f604214e4889203a Mon Sep 17 00:00:00 2001
From: simon
Date: Wed, 3 Feb 2021 10:32:24 +0700
Subject: [PATCH] improved sitemap parsing

---
 README.md           |  5 +++--
 src/second_stage.py | 54 ++++++++++++++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 4a739cd..f666253 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ The downside, additional to the limitations above is, that depending on the amou
 
 ## Installation
 
-Install required none standard Python libraries:
+Install the required non-standard Python library **requests**, used to make the HTTP calls ([link](https://pypi.org/project/requests/)):
 
 * On Arch: `sudo pacman -S python-requests`
 * Via Pip: `pip install requests`
@@ -72,8 +72,9 @@ Copy or rename the file *config.sample* to *config* and make sure you set all th
 The config file supports the following settings:
 * *start_url* : Fully qualified URL of the home page of the website to parse. Add *www* if your canonical website uses it to avoid landing in a redirect for every request.
   * example: `https://www.example.com/`
-* *sitemap_url* : Link to the sitemap, so pages not linked anywhere but indexed can get parsed too.
+* *sitemap_url* : Link to the sitemap, so pages not linked anywhere but indexed can get parsed too. This can be a direct link to your sitemap or a link to a sitemap index (a list of sitemaps).
   * example: `https://www.example.com/sitemap_index.xml`
   * example: `https://www.example.com/sitemap.xml`
 * *upload_folder* : Wordpress upload folder where the media library builds the folder tree.
   * example: `https://www.example.com/wp-content/uploads/` for a default wordpress installation.
 * *valid_img_mime* : A comma separated list of image [MIME types](https://www.iana.org/assignments/media-types/media-types.xhtml#image) you want to consider as a image to check for its existence. An easy way to exclude files like PDFs or other media files.
diff --git a/src/second_stage.py b/src/second_stage.py
index 248a54e..ddcc558 100644
--- a/src/second_stage.py
+++ b/src/second_stage.py
@@ -12,7 +12,7 @@ def discover_sitemap(config, discovered):
     sitemap_url = config['sitemap_url']
     request_timeout = config['request_timeout']
     # get main
-    print("look at sitemap")
+    print("parsing sitemap")
     try:
         response = requests.get(sitemap_url)
     except ConnectionError:
@@ -20,29 +20,43 @@ def discover_sitemap(config, discovered):
         response = requests.get(sitemap_url)
     xml = response.text
     soup = BeautifulSoup(xml, features="lxml")
-    sitemap_tags = soup.find_all("sitemap")
-    sitemap_list = [map.findNext("loc").text for map in sitemap_tags]
-    # loop through all list and get map by map
-    all_sitemap_pages = []
-    for sitemap in sitemap_list:
-        try:
-            response = requests.get(sitemap)
-        except ConnectionError:
-            sleep(request_timeout)
-            response = requests.get(sitemap)
-        xml = response.text
-        soup = BeautifulSoup(xml, features="lxml")
-        page_tags = soup.find_all("url")
-        page_list = [map.findNext("loc").text for map in page_tags]
-        # add every page to list
-        for page in page_list:
-            all_sitemap_pages.append(page)
-    # sort and return
-    all_sitemap_pages.sort()
+    # build the list of pages from the sitemap(s)
+    all_sitemap_pages = parse_sitemap(soup, request_timeout)
     # add to discovered list if new
     discovered = [discovered.append(page) for page in all_sitemap_pages
                   if page not in discovered]
 
 
+def parse_sitemap(soup, request_timeout):
+    """ called from discover_sitemap to build the site list,
+    figures out if it's a single sitemap or a list of sitemaps """
+    sitemap_tags = soup.find_all("sitemap")
+    if not sitemap_tags:
+        # is already the sitemap, every <loc> is a page
+        page_tags = soup.find_all("url")
+        all_sitemap_pages = [tag.find_next("loc").text for tag in page_tags]
+    else:
+        # is a list of sitemaps to loop through
+        all_sitemap_pages = []
+        sitemap_list = [tag.find_next("loc").text for tag in sitemap_tags]
+        for sitemap in sitemap_list:
+            try:
+                response = requests.get(sitemap)
+            except ConnectionError:
+                sleep(request_timeout)
+                response = requests.get(sitemap)
+            xml = response.text
+            soup = BeautifulSoup(xml, features="lxml")
+            page_tags = soup.find_all("url")
+            page_list = [tag.find_next("loc").text for tag in page_tags]
+            # add every page to list
+            for page in page_list:
+                all_sitemap_pages.append(page)
+            # throttle between sitemap requests
+            sleep(request_timeout)
+    # sort and return
+    all_sitemap_pages.sort()
+    return all_sitemap_pages
+
 def get_media_lib(config):
     """ returns a list of dics of media files in library """
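
For reference, a minimal standalone sketch of the detection logic parse_sitemap() relies on: a sitemap index wraps its child sitemaps in <sitemap> tags, while a plain sitemap lists pages in <url> tags. The XML samples and example.com URLs below are invented for illustration; it only needs bs4 and lxml, which the script already depends on.

from bs4 import BeautifulSoup

# invented sample of a sitemap index: each <loc> points at a further sitemap
SITEMAP_INDEX = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://www.example.com/post-sitemap.xml</loc></sitemap>
  <sitemap><loc>https://www.example.com/page-sitemap.xml</loc></sitemap>
</sitemapindex>"""

# invented sample of a plain sitemap: each <loc> is a page itself
PLAIN_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://www.example.com/</loc></url>
  <url><loc>https://www.example.com/about/</loc></url>
</urlset>"""

for label, xml in (("index", SITEMAP_INDEX), ("plain", PLAIN_SITEMAP)):
    soup = BeautifulSoup(xml, features="lxml")
    sitemap_tags = soup.find_all("sitemap")
    if sitemap_tags:
        # sitemap index: these URLs must be fetched and parsed in turn
        locs = [tag.find("loc").text for tag in sitemap_tags]
    else:
        # already a sitemap: these URLs are the pages
        locs = [tag.find("loc").text for tag in soup.find_all("url")]
    print(label, "->", locs)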
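
And a hypothetical driver for the call site, matching the *sitemap_url* setting described in the README. The config values and the import path (assuming src/ is on the module path) are assumptions; the real script reads these settings from the *config* file, which defines more keys than shown here.

from second_stage import discover_sitemap

# hypothetical config values, mirroring the README examples
config = {
    "sitemap_url": "https://www.example.com/sitemap_index.xml",
    "request_timeout": 10,
}

discovered = []
discover_sitemap(config, discovered)  # appends any new pages in place
print(len(discovered), "pages discovered via sitemap")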