Adapting to run in a Jupyter notebook

This commit is contained in:
simon 2021-03-24 14:14:45 +07:00
parent 7f6953d6c4
commit d456a4ab28
1 changed file with 18 additions and 1 deletion

View File

@ -2,7 +2,9 @@
""" spiderman """ """ spiderman """
import configparser import configparser
import os
from time import sleep from time import sleep
from IPython.display import display, HTML
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -64,9 +66,24 @@ def main():
def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config): def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
""" start the main loop to discover new pages """ """ start the main loop to discover new pages """
request_timeout = config['request_timeout'] request_timeout = config['request_timeout']
# status
title = []
for page in discovered: for page in discovered:
if page not in indexed: if page not in indexed:
print(f'parsing [{len(indexed)}]/[{len(discovered)}] {page}') # progress
message = f'parsing [{len(indexed)}]/[{len(discovered)}] {page}'
if len(title) > 4:
title.pop(0)
title.append(message)
try:
# will fail if not in jupyter notebook
get_ipython
clear_output(wait=True)
except:
# will work in shell
os.system('clear')
print('\n'.join(title))
# parse
img_list, href_list = parse_page(page, connectivity_cache, config) img_list, href_list = parse_page(page, connectivity_cache, config)
for img in img_list: for img in img_list:
main_img_list.append(img) main_img_list.append(img)