diff --git a/spider.py b/spider.py
index 8e1245c..2c227a1 100755
--- a/spider.py
+++ b/spider.py
@@ -2,7 +2,9 @@
 """ spiderman """
 
 import configparser
+import os
 from time import sleep
+from IPython.display import display, HTML, clear_output
 
 import requests
 from bs4 import BeautifulSoup
@@ -64,9 +66,26 @@ def main():
 def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
     """ start the main loop to discover new pages """
     request_timeout = config['request_timeout']
+    # status: rolling window holding the most recent progress messages
+    title = []
     for page in discovered:
         if page not in indexed:
-            print(f'parsing [{len(indexed)}]/[{len(discovered)}] {page}')
+            # progress: keep at most five messages, dropping the oldest first
+            message = f'parsing [{len(indexed)}]/[{len(discovered)}] {page}'
+            if len(title) > 4:
+                title.pop(0)
+            title.append(message)
+            try:
+                # get_ipython is only defined inside IPython/Jupyter; the bare
+                # name lookup raises NameError in a plain interpreter
+                get_ipython
+            except NameError:
+                # plain shell: clear the terminal ('cls' on Windows)
+                os.system('cls' if os.name == 'nt' else 'clear')
+            else:
+                clear_output(wait=True)
+            print('\n'.join(title))
+            # parse
             img_list, href_list = parse_page(page, connectivity_cache, config)
             for img in img_list:
                 main_img_list.append(img)