Adapting to run in Jupyter Notebook
This commit is contained in:
parent
7f6953d6c4
commit
d456a4ab28
19
spider.py
19
spider.py
|
@ -2,7 +2,9 @@
|
||||||
""" spiderman """
|
""" spiderman """
|
||||||
|
|
||||||
import configparser
|
import configparser
|
||||||
|
import os
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
from IPython.display import display, HTML
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
@ -64,9 +66,24 @@ def main():
|
||||||
def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
|
def page_processing(discovered, indexed, main_img_list, main_href_list, connectivity_cache, config):
|
||||||
""" start the main loop to discover new pages """
|
""" start the main loop to discover new pages """
|
||||||
request_timeout = config['request_timeout']
|
request_timeout = config['request_timeout']
|
||||||
|
# status
|
||||||
|
title = []
|
||||||
for page in discovered:
|
for page in discovered:
|
||||||
if page not in indexed:
|
if page not in indexed:
|
||||||
print(f'parsing [{len(indexed)}]/[{len(discovered)}] {page}')
|
# progress
|
||||||
|
message = f'parsing [{len(indexed)}]/[{len(discovered)}] {page}'
|
||||||
|
if len(title) > 4:
|
||||||
|
title.pop(0)
|
||||||
|
title.append(message)
|
||||||
|
try:
|
||||||
|
# will fail if not in jupyter notebook
|
||||||
|
get_ipython
|
||||||
|
clear_output(wait=True)
|
||||||
|
except:
|
||||||
|
# will work in shell
|
||||||
|
os.system('clear')
|
||||||
|
print('\n'.join(title))
|
||||||
|
# parse
|
||||||
img_list, href_list = parse_page(page, connectivity_cache, config)
|
img_list, href_list = parse_page(page, connectivity_cache, config)
|
||||||
for img in img_list:
|
for img in img_list:
|
||||||
main_img_list.append(img)
|
main_img_list.append(img)
|
||||||
|
|
Loading…
Reference in New Issue