Adapting to run in a Jupyter notebook

2021-03-24 14:14:45 +07:00 · 2021-03-24 14:14:45 +07:00 · d456a4ab28
parent 7f6953d6c4
commit d456a4ab28
1 changed file with 18 additions and 1 deletion
--- a/spider.py
+++ b/spider.py
@ -2,7 +2,9 @@
 """ spiderman """
 import configparser
 import os
 from time import sleep
 from IPython.display import display, HTML
 import requests
 from bs4 import BeautifulSoup
@ -64,9 +66,24 @@ def main():
 def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
    """ start the main loop to discover new pages """
    request_timeout = config['request_timeout']
    # status
    title = []
    for page in discovered:
        if page not in indexed:
-            print(f'parsing [{len(indexed)}]/[{len(discovered)}] {page}')
+            # progress
            message = f'parsing [{len(indexed)}]/[{len(discovered)}] {page}'
            if len(title) > 4:
                title.pop(0)
            title.append(message)
            try:
                # will fail if not in jupyter notebook
                get_ipython
                clear_output(wait=True)
            except:
                # will work in shell
                os.system('clear')
            print('\n'.join(title))
            # parse
            img_list, href_list = parse_page(page, connectivity_cache, config)
            for img in img_list:
                main_img_list.append(img)