Adapt to run in a Jupyter notebook
This commit is contained in:
parent
7f6953d6c4
commit
d456a4ab28
19
spider.py
19
spider.py
|
@ -2,7 +2,9 @@
|
|||
""" spiderman """
|
||||
|
||||
import configparser
|
||||
import os
|
||||
from time import sleep
|
||||
from IPython.display import display, HTML
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
@ -64,9 +66,24 @@ def main():
|
|||
def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
|
||||
""" start the main loop to discover new pages """
|
||||
request_timeout = config['request_timeout']
|
||||
# status
|
||||
title = []
|
||||
for page in discovered:
|
||||
if page not in indexed:
|
||||
print(f'parsing [{len(indexed)}]/[{len(discovered)}] {page}')
|
||||
# progress
|
||||
message = f'parsing [{len(indexed)}]/[{len(discovered)}] {page}'
|
||||
if len(title) > 4:
|
||||
title.pop(0)
|
||||
title.append(message)
|
||||
try:
|
||||
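# get_ipython only exists under IPython/Jupyter; elsewhere the bare name raises NameError
# NOTE(review): only display and HTML are visibly imported from IPython.display -
# confirm clear_output is imported too, or the except below hides the NameError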
|
||||
get_ipython
|
||||
clear_output(wait=True)
|
||||
except:
|
||||
# fallback when the notebook path raised: clear the terminal with the shell's clear command
|
||||
os.system('clear')
|
||||
print('\n'.join(title))
|
||||
# parse
|
||||
img_list, href_list = parse_page(page, connectivity_cache, config)
|
||||
for img in img_list:
|
||||
main_img_list.append(img)
|
||||
|
|
Loading…
Reference in New Issue