Python Multiprocessing in web crawler

I am trying to implement multiprocessing in my web crawler. What I usually see online is passing URLs as the arguments to map, map_async, or apply_async. The data I am crawling is in a table, so I extract it with two BeautifulSoup find_all calls, one for the rows and one for the columns. Since the data I am crawling is sometimes on a single page that only requires one URL, I tried to use the list returned by find_all as the argument to map_async, but I get the error "Fatal Python error: Cannot recover from stack overflow."
The error occurs on the following line:
return_list = pool.map_async(func, Species_all_recorded_data_List)
How can I solve this, or where would it be better to put the multiprocessing?
The second problem is that if I put code above the function crawl_all_data_mp at module level, it all executes again when pool = Pool() is created. I worked around it by simply moving all the other code below that function. That may not be correct, since I still can't really run the code because of the first error.
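My understanding (please correct me if I am wrong) is that this happens because multiprocessing re-imports the module when the Pool starts its worker processes on Windows, so anything at module level runs again in every worker. A minimal sketch of the guard pattern that seems to be recommended (worker and main here are placeholder names, not my real functions):

from multiprocessing import Pool

def worker(item):
    # placeholder for the real per-row work
    return item * 2

def main():
    # everything that should run exactly once goes here, not at module level
    with Pool(10) as pool:
        results = pool.map(worker, range(20))
    print(results)

if __name__ == '__main__':
    # the guard keeps this from re-running when the workers import the module
    main()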
Looking for your advice
My code:
(1) Function to call for web crawling
from tkinter import filedialog
from tkinter import *
import csv
import os.path
from os import path
from Index import *
from Dragonfly import *
import codecs
import multiprocessing
from multiprocessing import Process, Value
from functools import partial

# multiprocessing version
def multiprocessing_row_data(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page, Species_all_record_data_Data_Set):
    global DataCNT, stop_crawl_all_data_mp
    tmp_List = Species_all_record_data_Data_Set.find_all('td')
    # End conditions:
    # 1. no data on the next page
    # 2. when updating, stop once the old data is reached by inspecting its ID
    # 3. the count exceeds the limit count
    id = tmp_List[0].text
    if (len(id) == 0) or (DataCNT >= expecting_CNT) or (DataCNT >= Limit_CNT):
        print(' --Finish crawl--' + ' crawl to page: ' + str(page) + ", ID: " + id + ", count: " + str(DataCNT))
        stop_crawl_all_data_mp = True
        raise StopIteration
    # access the same value in memory when doing multiprocessing
    with DataCNT.get_lock():
        DataCNT.value += 1
    response_DetailedInfo = session.post(general_url + Detailed_discriptions_url + id, headers=headers)
    soup2 = BeautifulSoup(response_DetailedInfo.text, 'html.parser')
    print("Current finished datas >> " + str(DataCNT.value) + " /" + str(Total_num) + " (" + str(DataCNT.value * 100 / Total_num) + "%)", end='\r')
    return DetailedTableInfo(tmp_List[0].text, tmp_List[1].text, tmp_List[2].text, tmp_List[3].text, tmp_List[4].text, tmp_List[5].text, tmp_List[7].text, tmp_List[6].text,
                             soup2.find(id='R_LAT').get('value'),
                             soup2.find(id='R_LNG').get('value'),
                             Web_rawl_Species_family_name,
                             Web_rawl_Species_name,
                             soup2.find(id='R_MEMO').get('value'))

def crawl_all_data_mp(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID):
    page = 0
    DataList = []
    while not stop_crawl_all_data_mp:
        pool = multiprocessing.Pool(10)
        Species_all_recorded_data = session.post(general_url +
                                                 species_all_record_data_first_url +
                                                 species_all_record_data_page_url + str(page) +
                                                 species_all_record_data_species_url +
                                                 Species_class_key[Web_rawl_Species_family_name] +
                                                 Species_key[Web_rawl_Species_name],
                                                 headers=headers)
        soup = BeautifulSoup(Species_all_recorded_data.text, 'html.parser')
        Species_all_recorded_data_List = soup.find_all(id='theRow')
        func = partial(multiprocessing_row_data, Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page)
        return_list = pool.map_async(func, Species_all_recorded_data_List)
        DataList.append(list(filter(None, return_list.get())))
        page += 1
    # make sure the worker processes have finished before returning
    pool.close()
    pool.join()
    return [DataList, page]
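One change I am considering (I am not sure if this is the right direction) is to extract plain strings from each row in the parent process and map over those instead of the BeautifulSoup Tag objects, since the Tag objects have to be pickled to reach the worker processes and they are deeply nested, which I suspect may be related to the stack overflow. A rough sketch with made-up names (process_row stands in for my multiprocessing_row_data):

def row_to_texts(row_tag):
    # runs in the parent process, so the Tag object never has to be pickled
    return [td.text for td in row_tag.find_all('td')]

def process_row(row_texts):
    # runs in a worker and only receives a plain list of strings
    record_id = row_texts[0]
    # ... fetch the detail page for record_id and build the result here ...
    return record_id

def crawl_page_sketch(pool, soup):
    rows = [row_to_texts(r) for r in soup.find_all(id='theRow')]
    return pool.map(process_row, rows)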
(2) main
It goes wrong on the following line, which calls the function above:
[datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
the main code:
# --main--
if __name__ == '__main__':
    # setting
    Input_species_famliy = "細蟌科"
    Input_species = "四斑細蟌"
    limit_cnt = 6000
    folder = 'Crawl_Data\\' + Species_class_key[Input_species_famliy]
    File_name = folder + "\\" + Species_class_key[Input_species_famliy] + Species_key[Input_species] + '.csv'
    oldID = 0
    oldData_len = 0
    print("--Start crawl-- " + Input_species_famliy + " " + Input_species)
    print("[folder]: " + folder)
    stop_crawl_all_data_mp = False
    # check whether the file exists
    file_check = path.exists(current_path + "\\" + File_name)
    # get the old ID
    if file_check:
        file_size = os.stat(current_path + "\\" + File_name).st_size
        if not file_size == 0:
            with open(File_name, newline='', errors="ignore") as F:
                R = csv.reader(F)
                oldData = [line for line in R]
            oldID = oldData[0][0]
            oldData_len = len(oldData) - 1
    # login
    Login_Web(myaccount, mypassword)
    # find the total number for the input species (expected to run once)
    Species_total_num_Dict = Find_species_total_data()
    # get the data
    Total_num = int(Species_total_num_Dict[Input_species])
    #[datatmpList, page] = crawl_all_data(Input_species_famliy, Input_species, Total_num, limit_cnt, oldID)
    expecting_CNT = Total_num - oldData_len  # total number of records that need to be updated or crawled
    [datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
    Data = []
    for Data_tmp in datatmpList:
        Data.append([Data_tmp.SpeciesFamily,
                     Data_tmp.Species,
                     Data_tmp.IdNumber,
                     Data_tmp.Dates,
                     Data_tmp.Times,
                     Data_tmp.User,
                     Data_tmp.City,
                     Data_tmp.Dictrict,
                     Data_tmp.Place,
                     Data_tmp.Altitude,
                     Data_tmp.Latitude,
                     Data_tmp.Longitude,
                     Data_tmp.Description
                     ])
    # automatically make the directories
    newDir = current_path + "\\" + folder
    if not os.path.isdir(newDir):
        os.mkdir(newDir)
    # 'a' stands for append, which appends the new data to the old file
    with open(File_name, mode='a', newline='', errors="ignore") as employee_file:
        employee_writer = csv.writer(employee_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # init: no file exists yet or the file is empty
        if (not file_check) or (file_size == 0):
            employee_writer.writerow(CSV_Head)
            employee_writer.writerows(Data)
        # otherwise insert the new data before the old data
        else:
            for i in range(0, len(Data)):
                oldData.insert(i, Data[i])
            employee_writer.writerows(oldData)

Related

Extracting Gzip files from requests with multithreading

I'm trying to extract a bunch (62,000) of gzip files. Each of these files contains a text document formatted as JSON. Right now I'm downloading all these files with the requests module using multithreading:
def fetch_file(url, filename):
    try:
        html = requests.get(url, stream=True, allow_redirects=True)
        open('Streams Total Daily/' + filename + '.json.gz', 'wb').write(html.content)
        return html.status_code
    except requests.exceptions.RequestException as e:
        return e

def get_streams():
    threads = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        for uri in country_uris:
            split = uri.split('/')
            filename = 'streams_' + split[1] + '_' + split[4] + '_' + split[5] + '_' + split[6] + '_' + split[7]
            url = f"{link}{uri}?access_token={access_token}"
            threads.append(executor.submit(fetch_file, url, filename))
        for task in as_completed(threads):
            print(task.result())

get_streams()
I have some code that can loop through the folder where the files are placed, but this takes a long time with 62000 files. I've tried some versions passing response.content through gzip.GzipFile(), but this just gives me empty files.
def fetch_file(url, filename):
    try:
        html = requests.get(url, stream=True, allow_redirects=True)
        gz = gzip.GzipFile(fileobj=html.content)
        with open('test/' + filename + '.json', 'wb') as out:
            out.write(gz.read())
        return html.status_code
    except requests.exceptions.RequestException as e:
        return e

def get_streams():
    threads = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        for uri in country_uris:
            split = uri.split('/')
            filename = 'streams_' + split[1] + '_' + split[4] + '_' + split[5] + '_' + split[6] + '_' + split[7]
            url = f"{link}{uri}?access_token={access_token}"
            threads.append(executor.submit(fetch_file, url, filename))
        for task in as_completed(threads):
            print(task.result())

get_streams()
Does anyone have an idea on how to handle this? Any suggestions or solutions are much appreciated!
What worked for me, in the end, was the zlib module: first get the response body as bytes with response.content, then decompress it with zlib.decompress(response.content, 16 + zlib.MAX_WBITS), and finally write the decompressed data to a .json file:
def get_files(i):
    file_url = f"{url}{i}"  # use a new name so the module-level base url stays visible here
    elements = i.split('/')
    name = elements[1] + '_' + elements[3] + '_' + elements[4] + '_' + elements[5] + '_' + elements[6] + '_' + elements[7]
    try:
        response = requests.get(url=file_url, headers=headers, allow_redirects=True).content
        decompressed_data = zlib.decompress(response, 16 + zlib.MAX_WBITS)
        with open(f"Streams Total Daily/{name}.json", 'wb') as out:
            out.write(decompressed_data)
    except requests.exceptions.RequestException as e:
        return e

def runner():
    threads = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in country_files:
            threads.append(executor.submit(get_files, i))

runner()
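For what it's worth, the standard gzip module should be able to do the same thing without the wbits trick, since gzip.decompress handles the gzip header itself. A small sketch (the URL, name and headers are stand-ins for whatever you build per file):

import gzip
import requests

def fetch_and_unpack(url, name, headers):
    # download the gzipped payload and write the decompressed JSON next to the others
    raw = requests.get(url, headers=headers, allow_redirects=True).content
    data = gzip.decompress(raw)  # equivalent to zlib.decompress(raw, 16 + zlib.MAX_WBITS)
    with open(f"Streams Total Daily/{name}.json", "wb") as out:
        out.write(data)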

Can I pause a scroll function in selenium, scrape the current data, and then continue scrolling later in the script?

I am a student working on a scraping project and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently keeps everything in memory until the end, so my plan is to break the scrape into smaller chunks and write the data out periodically, instead of building one big list and writing it all out at the end (a sketch of what I mean is after my current code below).
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []
driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)
    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"
    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))
driver.close()
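The periodic write-out I have in mind would look roughly like this (a sketch only: BATCH_SIZE, write_batch and profile_links are made-up names, and the per-profile scraping would be the same as in the code above):

import csv

BATCH_SIZE = 50  # arbitrary batch size for this sketch

def write_batch(rows):
    # append the current batch so nothing has to stay in memory until the end
    with open("Spredsheet.txt", "a", newline="") as output:
        csv.writer(output).writerows(rows)

profile_links = ["https://directory.bcsp.org/profile-example"]  # placeholder for the scrolled-in links
batch = []
for link in profile_links:
    # ... scrape one profile into a tuple called info, as in the code above ...
    info = ("Name", "IssuedBy", "Number", "Since", "AccreditedBy", "Expires")
    batch.append(info)
    if len(batch) >= BATCH_SIZE:
        write_batch(batch)
        batch = []  # drop what has already been written
if batch:
    write_batch(batch)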
Try the approach below using requests and BeautifulSoup. In the script I have used the search API URL fetched from the website itself.
On the first iteration it creates the URL, then writes the headers and data to the .csv file.
On later iterations it creates the URL again with two extra parameters, start_on_page and show_per_page, where start_on_page is incremented by 20 on each iteration and show_per_page is set to 100 to extract 100 records per iteration, and so on until all the data has been dumped into the .csv file.
The script dumps four fields: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so this approach also resolves your memory issue.
Before running the script, do not forget to set the file_path variable to the directory where you want the .csv file created.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other then first iteration URL created: ' + create_url)
            print('-' * 100)
        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')
        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        count += 1
        page_number += 20

scrap_directory_data()

np.arange returns <Response 200> instead of value

I'm trying to write a script that scrapes the text of multiple webpages with slightly differing URLs. I want to go through the pages with an np.arange function that inserts a string into the URL. But there must be something wrong with the URL the script is composing: the document that stores the scraped text only contains messages like "this site does not exist anymore". The steps I have taken to get closer to a solution are detailed below. Here is my code.
from bs4 import BeautifulSoup
import requests
import numpy as np
import datetime
from time import sleep
from random import randint

datum = datetime.datetime.now()
pages = np.arange(1, 20, 1)
datum_jetzt = datum.strftime("%Y") + "-" + datum.strftime("%m") + "-" + datum.strftime("%d")
url = "https://www.shabex.ch/pub/" + datum_jetzt + "/index-"
results = requests.get(url)
file_name = "" + datum.strftime("%Y") + "-" + datum.strftime("%m") + "-" + datum.strftime("%d") + "-index.htm"

for page in pages:
    page = requests.get("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm")
    soup = BeautifulSoup(results.text, "html.parser")
    texte = soup.get_text()
    sleep(randint(2, 5))
    f = open(file_name, "a")
    f.write(texte)
    f.close
I found that if I enter print("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm") in the console, I get https://www.shabex.ch/pub/2020-05-18/index-<Response [200]>.htm. So the np.arange value seems to have been replaced by the web server's response instead of the number I expected.
Where have I gone wrong?
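What I am trying to get to is roughly the loop below, where each iteration fetches its own page and parses that response rather than the results variable from outside the loop. This is only a sketch reusing pages, datum_jetzt and file_name from the script above, and I am not sure it is the right structure:

for page in pages:
    # keep the loop variable and the HTTP response in separate names
    response = requests.get("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm")
    soup = BeautifulSoup(response.text, "html.parser")
    texte = soup.get_text()
    sleep(randint(2, 5))
    with open(file_name, "a") as f:
        f.write(texte)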

how to get "today's" quotes from yahoo via pandas-datareader

When using pandas-datareader with Yahoo, if start and end are the same date I get no information back when I ask on that date. If I ask a day later, it works. But I want today's close today.
import sys
from sqlalchemy import *
import os
import datetime
import pandas_datareader.data as web

end = datetime.datetime(2015, 10, 15)
start = datetime.datetime(2015, 10, 15)
path = 'c:\\python34\\myprojects\\msis\\'
try:
    os.mkdir(path)
except:
    pass
fname = path + 'test.txt'
fhand = open(fname, 'w')
engine = create_engine('mysql+mysqlconnector://root:#localhost /stockinfo')
connection = engine.connect()
result1 = engine.execute("select symbol from equities where daily = 'Y'")
for sqlrow in result1:
    try:
        info = web.DataReader(sqlrow[0], 'yahoo', start, end)
        print(info)
        close = info['Close'].ix['2015-10-14']
        print("=========================" + str(round(close, 4)))
        answer = "Closing price for " + sqlrow[0] + " is " + str(round(close, 4)) + "\n"
    except:
        answer = "No success for " + sqlrow[0] + "\n"
    fhand.write(answer)
    # result2 = engine.execute("update holdings set lasrprice = " + round(close,4) + " where symbol = '" + sqlrow[0] + "'")
    # result2.close()
result1.close()
fhand.close()
The code takes the second "except" route.
What am I doing wrong/what is happening?
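To see what is actually failing, I could narrow the bare except and print the exception; a sketch of just the loop, keeping everything else above the same:

for sqlrow in result1:
    try:
        info = web.DataReader(sqlrow[0], 'yahoo', start, end)
        close = info['Close'].ix['2015-10-14']
        answer = "Closing price for " + sqlrow[0] + " is " + str(round(close, 4)) + "\n"
    except Exception as e:
        # print the real error instead of silently falling through,
        # e.g. a KeyError when no row exists for that date
        print("Failed for " + sqlrow[0] + ": " + repr(e))
        answer = "No success for " + sqlrow[0] + "\n"
    fhand.write(answer)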

Python script works but fails after compilation (Windows)

I am working on a script to scrape a website. The problem is that it works normally when I run it with the interpreter, but after compiling it (PyInstaller or py2exe) it fails; it appears that mechanize / requests both fail to keep the session alive.
I have hidden my username and password here, but I did put them in correctly in the compiled code.
import requests
from bs4 import BeautifulSoup as bs
from sys import argv
import re
import logging

url = argv[1]
payload = {"userName": "real_username", "password": "realpassword"}
session = requests.session()
resp = session.post("http://website.net/login.do", data=payload)
if "forgot" in resp.content:
    logging.error("Login failed")
    exit()
resp = session.get(url)
soup = bs(resp.content)
urlM = url[:url.find("?") + 1] + "page=(PLACEHOLDER)&" + \
       url[url.find("?") + 1:]
# Get number of pages
regex = re.compile("\|.*\|\sof\s(\d+)")
script = str(soup.findAll("script")[1])
epNum = int(re.findall(regex, script)[0])  # Number of EPs
pagesNum = epNum // 50
links = []
# Get list of links
# If number of EPs > 50, more than one page
if pagesNum == 0:
    links = [url]
else:
    for i in range(1, pagesNum + 2):
        url = urlM.replace("(PLACEHOLDER)", str(i))
        links.append(url)
# Loop over the links and extract info: ID, NAME, START_DATE, END_DATE
raw_info = []
for pos, link in enumerate(links):
    print "Processing page %d" % (pos + 1)
    sp = bs(session.get(link).content)
    table = sp.table.table
    raw_info.extend(table.findAll("td"))
epURL = "http://www.website.net/exchange/viewep.do?operation"\
        "=executeAction&epId="
# Final data extraction
raw_info = map(str, raw_info)
ids = [re.findall("\d+", i)[0] for i in raw_info[::4]]
names = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[1::4]]
start_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[2::4]]
end_dates = [re.findall("<td>(.*)</td", i)[0] for i in raw_info[3::4]]
emails = []
eplinks = [epURL + str(i) for i in ids]
print names
The error happens at the epNum variable, which suggests, as far as I can tell, that the HTML page is not the one I requested. On Linux it works both as a script and compiled; on Windows it works as a script but fails when compiled.
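One debugging step I plan to try is dumping the HTML the compiled build actually receives, so I can diff it against what the interpreter run gets; a small sketch that would go right after the login request:

# hypothetical debugging aid: save the page returned after login in the compiled build
with open("login_response.html", "wb") as dump:
    dump.write(resp.content)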
The py2exe tutorial mentions that you need MSVCR90.dll; did you check that it's present on the PC?
