improved sitemap parsing
This commit is contained in:
parent
9321aa35fb
commit
49e122523e
|
@ -33,7 +33,7 @@ The downside, additional to the limitations above is, that depending on the amou
|
||||||
|
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
Install the required non-standard Python libraries:
|
Install the required non-standard Python libraries:
|
||||||
**requests** to make the HTTP calls, [link](https://pypi.org/project/requests/)
|
**requests** to make the HTTP calls, [link](https://pypi.org/project/requests/)
|
||||||
* On Arch: `sudo pacman -S python-requests`
|
* On Arch: `sudo pacman -S python-requests`
|
||||||
* Via Pip: `pip install requests`
|
* Via Pip: `pip install requests`
|
||||||
|
@ -72,8 +72,9 @@ Copy or rename the file *config.sample* to *config* and make sure you set all th
|
||||||
The config file supports the following settings:
|
The config file supports the following settings:
|
||||||
* *start_url* : Fully qualified URL of the home page of the website to parse. Add *www* if your canonical website uses it to avoid landing in a redirect for every request.
|
* *start_url* : Fully qualified URL of the home page of the website to parse. Add *www* if your canonical website uses it to avoid landing in a redirect for every request.
|
||||||
* example: `https://www.example.com/`
|
* example: `https://www.example.com/`
|
||||||
* *sitemap_url* : Link to the sitemap, so pages not linked anywhere but indexed can get parsed too.
|
* *sitemap_url* : Link to the sitemap, so that pages which are indexed but not linked anywhere can be parsed too. The link can point directly to a single sitemap or to a sitemap index (a list of sitemaps).
|
||||||
* example: `https://www.example.com/sitemap_index.xml`
|
* example: `https://www.example.com/sitemap_index.xml`
|
||||||
|
* example: `https://www.example.com/sitemap.xml`
|
||||||
* *upload_folder* : Wordpress upload folder where the media library builds the folder tree.
|
* *upload_folder* : Wordpress upload folder where the media library builds the folder tree.
|
||||||
* example: `https://www.example.com/wp-content/uploads/` for a default wordpress installation.
|
* example: `https://www.example.com/wp-content/uploads/` for a default wordpress installation.
|
||||||
* *valid_img_mime* : A comma-separated list of image [MIME types](https://www.iana.org/assignments/media-types/media-types.xhtml#image) that should be treated as images and checked for existence. This is an easy way to exclude files like PDFs or other media files.
|
* *valid_img_mime* : A comma-separated list of image [MIME types](https://www.iana.org/assignments/media-types/media-types.xhtml#image) that should be treated as images and checked for existence. This is an easy way to exclude files like PDFs or other media files.
|
||||||
|
|
|
@ -12,7 +12,7 @@ def discover_sitemap(config, discovered):
|
||||||
sitemap_url = config['sitemap_url']
|
sitemap_url = config['sitemap_url']
|
||||||
request_timeout = config['request_timeout']
|
request_timeout = config['request_timeout']
|
||||||
# get main
|
# get main
|
||||||
print("look at sitemap")
|
print("parsing sitemap")
|
||||||
try:
|
try:
|
||||||
response = requests.get(sitemap_url)
|
response = requests.get(sitemap_url)
|
||||||
except ConnectionError:
|
except ConnectionError:
|
||||||
|
@ -20,29 +20,43 @@ def discover_sitemap(config, discovered):
|
||||||
response = requests.get(sitemap_url)
|
response = requests.get(sitemap_url)
|
||||||
xml = response.text
|
xml = response.text
|
||||||
soup = BeautifulSoup(xml, features="lxml")
|
soup = BeautifulSoup(xml, features="lxml")
|
||||||
sitemap_tags = soup.find_all("sitemap")
|
# build list of sites
|
||||||
sitemap_list = [map.findNext("loc").text for map in sitemap_tags]
|
all_sitemap_pages = parse_sitemap(soup, request_timeout)
|
||||||
# loop through all list and get map by map
|
|
||||||
all_sitemap_pages = []
|
|
||||||
for sitemap in sitemap_list:
|
|
||||||
try:
|
|
||||||
response = requests.get(sitemap)
|
|
||||||
except ConnectionError:
|
|
||||||
sleep(request_timeout)
|
|
||||||
response = requests.get(sitemap)
|
|
||||||
xml = response.text
|
|
||||||
soup = BeautifulSoup(xml, features="lxml")
|
|
||||||
page_tags = soup.find_all("url")
|
|
||||||
page_list = [map.findNext("loc").text for map in page_tags]
|
|
||||||
# add every page to list
|
|
||||||
for page in page_list:
|
|
||||||
all_sitemap_pages.append(page)
|
|
||||||
# sort and return
|
|
||||||
all_sitemap_pages.sort()
|
|
||||||
# add to discovered list if new
|
# add to discovered list if new
|
||||||
discovered = [discovered.append(page) for page in all_sitemap_pages if page not in discovered]
|
discovered = [discovered.append(page) for page in all_sitemap_pages if page not in discovered]
|
||||||
|
|
||||||
|
|
||||||
|
def _loc_texts(tags):
    """Return the stripped ``<loc>`` text of each tag in *tags* that has one."""
    locations = []
    for tag in tags:
        # Look only inside the tag itself.  A forward document search
        # (findNext) would silently pick up the <loc> of the *next* entry
        # when this one has none, or return None at the end of the document.
        loc = tag.find("loc")
        if loc is None:
            continue
        text = loc.text.strip()
        if text:
            locations.append(text)
    return locations


def parse_sitemap(soup, request_timeout):
    """Build the sorted list of page URLs described by a parsed sitemap.

    Called from discover_sitemap.  The document is treated as a sitemap
    index when it contains at least one ``<sitemap>`` element; otherwise it
    is treated as a single sitemap whose ``<url>`` entries are read directly.

    Args:
        soup: Parsed sitemap document; must provide ``find_all``.
        request_timeout: Value passed to ``sleep()`` before retrying a failed
            download and after each sub-sitemap has been processed.

    Returns:
        A sorted list of the ``<loc>`` values found.  Entries without a
        ``<loc>`` (or with an empty one) are skipped.

    Raises:
        Whatever ``requests.get`` raises if the single retry also fails.
    """
    sitemap_tags = soup.find_all("sitemap")

    if not sitemap_tags:
        # is already the sitemap
        return sorted(_loc_texts(soup.find_all("url")))

    # is a list of sitemaps to loop through
    all_sitemap_pages = []
    for sitemap_url in _loc_texts(sitemap_tags):
        try:
            response = requests.get(sitemap_url)
        except (requests.exceptions.ConnectionError, ConnectionError):
            # requests raises its own ConnectionError class, which does not
            # derive from the builtin of the same name, so name it explicitly.
            # Wait once and retry; a second failure propagates to the caller.
            sleep(request_timeout)
            response = requests.get(sitemap_url)

        sub_soup = BeautifulSoup(response.text, features="lxml")
        all_sitemap_pages.extend(_loc_texts(sub_soup.find_all("url")))

        # take it easy
        sleep(request_timeout)

    # sort and return
    all_sitemap_pages.sort()
    return all_sitemap_pages
|
||||||
|
|
||||||
|
|
||||||
def get_media_lib(config):
|
def get_media_lib(config):
|
||||||
""" returns a list of dics of media files in library """
|
""" returns a list of dics of media files in library """
|
||||||
|
|
Loading…
Reference in New Issue