Adapting to run in a Jupyter notebook

This commit is contained in:
simon 2021-03-24 14:14:45 +07:00
parent 7f6953d6c4
commit d456a4ab28
1 changed file with 18 additions and 1 deletion

View File

@ -2,7 +2,9 @@
""" spiderman """
import configparser
import os
from time import sleep
from IPython.display import display, HTML
import requests
from bs4 import BeautifulSoup
@ -64,9 +66,24 @@ def main():
def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
""" start the main loop to discover new pages """
request_timeout = config['request_timeout']
# status
title = []
for page in discovered:
if page not in indexed:
print(f'parsing [{len(indexed)}]/[{len(discovered)}] {page}')
# progress
message = f'parsing [{len(indexed)}]/[{len(discovered)}] {page}'
if len(title) > 4:
title.pop(0)
title.append(message)
try:
# will fail if not in jupyter notebook
get_ipython
clear_output(wait=True)
except:
# will work in shell
os.system('clear')
print('\n'.join(title))
# parse
img_list, href_list = parse_page(page, connectivity_cache, config)
for img in img_list:
main_img_list.append(img)