Adapting to run in a Jupyter notebook

2021-03-24 14:14:45 +07:00 · 2021-03-24 14:14:45 +07:00 · d456a4ab28
parent 7f6953d6c4
commit d456a4ab28
1 changed file with 18 additions and 1 deletion
--- a/spider.py
+++ b/spider.py
@@ -2,7 +2,9 @@
 """ spiderman """

 import configparser
+import os
 from time import sleep
+from IPython.display import display, HTML

 import requests
 from bs4 import BeautifulSoup
@@ -64,9 +66,24 @@ def main():
 def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
    """ start the main loop to discover new pages """
    request_timeout = config['request_timeout']
+    # status
+    title = []
    for page in discovered:
        if page not in indexed:
-            print(f'parsing [{len(indexed)}]/[{len(discovered)}] {page}')
+            # progress
+            message = f'parsing [{len(indexed)}]/[{len(discovered)}] {page}'
+            if len(title) > 4:
+                title.pop(0)
+            title.append(message)
+            try:
+                # will fail if not in jupyter notebook
+                get_ipython
+                clear_output(wait=True)
+            except:
+                # will work in shell
+                os.system('clear')
+            print('\n'.join(title))
+            # parse
            img_list, href_list = parse_page(page, connectivity_cache, config)
            for img in img_list:
                main_img_list.append(img)