I have a function that runs in parallel using the multiprocessing package while another one runs in the main process. The parallel function keeps running until the main one has finished. Each of them works on its own, but not when they run together. The two functions share only one argument (a global variable), and both use the BeautifulSoup package. They also have local variables with the same names, but I don't think that is a problem.
The code that launches them is:
stop_event, output = mp.Event(), mp.Queue()
count_price_change = mp.Process(target=count, args=(stop_event, output, data_2, header_2, driver, lancer))
count_price_change.start()
start = time.time()
collected = create_array_features(data_1, header_1, prices, change, changepct, volume, lancer)
#PROBLEM IS HERE, eventually with lancer ??
stop = time.time()
time.sleep(frequency_get - (stop - start))
stop_event.set()
counts_changes = output.get()
count_price_change.join()
count_price_change.terminate()
The exact error I get is:
Traceback (most recent call last):
File "/usr/lib/python3.6/code.py", line 91, in runcode
exec(code, self.locals)
File "<input>", line 6, in <module>
File "<input>", line 80, in create_array_features
File "/usr/local/lib/python3.6/dist-packages/bs4/__init__.py", line 300, in __init__
markup, from_encoding, exclude_encodings=exclude_encodings)):
File "/usr/local/lib/python3.6/dist-packages/bs4/builder/_htmlparser.py", line 240, in prepare_markup
exclude_encodings=exclude_encodings)
File "/usr/local/lib/python3.6/dist-packages/bs4/dammit.py", line 374, in __init__
for encoding in self.detector.encodings:
File "/usr/local/lib/python3.6/dist-packages/bs4/dammit.py", line 265, in encodings
self.markup, self.is_html)
File "/usr/local/lib/python3.6/dist-packages/bs4/dammit.py", line 323, in find_declared_encoding
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
TypeError: expected string or bytes-like object
I have tried building minimal reproducible examples, but they all work, and I am unable to figure out why I get the error. The whole code is (start at the bottom):
def parse_html(names, driver):
# need to make it wait until elements appear otherwise doesn't catch anything
list_names = []
for i in range(len(names)):
html_name = names[i].get_attribute('innerHTML')
soup = BeautifulSoup(html_name, "html.parser")
match = soup.find('a').text
list_names.append(match)
lancer = datetime.datetime.now().replace(microsecond=0).isoformat(' ')
prices = driver.find_elements_by_css_selector('div.item-field.item-price')
change = driver.find_elements_by_css_selector('div.item-field.item-change.pn--color')
changepct = driver.find_elements_by_css_selector('div.item-field.item-changepct.pn--color')
volume = driver.find_elements_by_css_selector('div.item-field.item-volume.is-made')
return list_names, lancer, prices, change, changepct, volume
def create_array_features(data, header, prices, change, changepct, volume, lancer):
# create an array of features
try:
list_to_parse = data.columns.get_level_values(0).unique()
except:
list_to_parse = data.get_level_values(0).unique() #this is for when the dataframe is empty
features = []
for i in range(len(list_to_parse)):
# prices
html = prices[i].get_attribute('innerHTML')
soup = BeautifulSoup(html, "html.parser")
soup = [text for text in soup.find_all(text=True) if text.parent.name != "label"][1]
match = prq.check_unit(soup)
features.append(match)
#parse some other stuff...
collected = pd.DataFrame([features], columns=header)
collected['time'] = pd.to_datetime(lancer)
collected = collected.set_index('time')
return collected
def count(stop_event, output, data_2, header_2, driver, lancer):
try:
list_to_parse = data_2.columns.get_level_values(0).unique()
except:
list_to_parse = data_2.get_level_values(0).unique() #this is for when the dataframe is empty
change_count_prices = [0 for i in range(len(list_to_parse))]
change_count_volume = [0 for i in range(len(list_to_parse))]
last_prices = [None for i in range(len(list_to_parse))]
last_volumes = [None for i in range(len(list_to_parse))]
while not stop_event.is_set():
WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'section.watchlist__container.expanded')))
names_ = driver.find_elements_by_css_selector('div.item-symbol-inner')
_, _, prices, _, _, volume = parse_html(names_, driver) #each time must update the price in order to check if has changed
for i in range(len(list_to_parse)):
html = prices[i].get_attribute('innerHTML')
soup = BeautifulSoup(html, "html.parser")
soup = [text for text in soup.find_all(text=True) if text.parent.name != "label"][1]
match = prq.check_unit(soup)
if match != last_prices[i]:
change_count_prices[i] += 1
last_prices[i] = match
#do some other stuff
change_count_prices.extend(change_count_volume)
print("exited and will stop")
collected = pd.DataFrame([change_count_prices], columns=header_2)
collected['time'] = pd.to_datetime(lancer)
collected = collected.set_index('time')
output.put(collected)  # the queue delivers the output once the stop event is set
def collect(data_1, data_2, header_1, header_2, frequency_get, driver,
lancer, prices, change, changepct, volume):
stop_event, output = mp.Event(), mp.Queue()
count_price_change = mp.Process(target=count, args=(stop_event, output, data_2, header_2, driver, lancer))
count_price_change.start()
start = time.time()
collected = create_array_features(data_1, header_1, prices, change, changepct, volume, lancer)
stop = time.time()
time.sleep(frequency_get - (stop - start))
stop_event.set()
counts_changes = output.get()
count_price_change.join()
count_price_change.terminate()
Because I can't recreate your problem, I will offer general advice: avoid shared variables between processes unless you know their state at every moment. You should also keep the two functions or scripts in the same directory, because otherwise fork-related issues can surface as errors.
You can usually solve problems like this by having the main and child processes communicate through a multiprocessing Pipe(), sharing the state of the variable in real time:
from multiprocessing import Process, Pipe

parent_conn, child_conn = Pipe()
p = Process(target=plot_map_function, args=(self.variable1, self.variable2, child_conn))
p.daemon=True
p.start()
parent_conn.recv()
p.join()
In the other function:
def plot_map_function(var1,var2,conn_to_parent):
#do something with variables
conn_to_parent.send(variable_i_want_to_send)
conn_to_parent.close()
Hope it helps
In the end, I was able to find the answer. The problem comes from the fact that Selenium cannot handle multiple tabs in parallel with the same driver. To use Selenium from several threads or processes, you must use a separate driver per worker, which is not what I was doing above.
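In sketch form, that means each worker process creates and quits its own driver. This is only a minimal illustration, assuming Chrome and placeholder example.com URLs, and it simply returns the page source:
import multiprocessing as mp
from selenium import webdriver

def worker(page_url, output):
    driver = webdriver.Chrome()          # each process owns its own browser
    try:
        driver.get(page_url)
        output.put(driver.page_source)   # pass the result back to the parent
    finally:
        driver.quit()                    # always release the browser

if __name__ == "__main__":
    output = mp.Queue()
    procs = [mp.Process(target=worker, args=(url, output))
             for url in ("https://example.com/a", "https://example.com/b")]
    for p in procs:
        p.start()
    results = [output.get() for _ in procs]   # read before joining
    for p in procs:
        p.join()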
Related
I have a JSON file with a list of URLs.
After reading the documentation, I figured multiprocessing.Pool was the best option for me.
I ran 10 URLs with multiprocessing.Pool(10) and expected the results to be almost instant, but it takes about 12 seconds to complete everything. I am not sure whether I am using it correctly; my code is below.
def download_site(data, outputArr, proxyArr=None):
session = requests.Session()
# print("Scraping last name {lastName}".format(lastName=data['lastName']))
userAgents = open('user-agents.txt').read().split('\n')
params = (
('Name', data['lastName']),
('type', 'P'),
)
url = 'someurl'
if not proxyArr:
proxyArr = {
'http': data['proxy']['http']
}
try:
with session.get(url, params=params, proxies=proxyArr, headers=headers) as response:
name = multiprocessing.current_process().name
try:
content = response.json()
loadJson = json.loads(content)['nameBirthDetails']
for case in loadJson:
dateFile = loadJson[case]['dateFiled']
year = int(dateFile.split('/')[-1])
if year > 2018:
profileDetailUrlParams = (
('caseId',loadJson[case]['caseYear']),
('caseType', 'WC'),
('caseNumber', loadJson[case]['caseSeqNbr']),
)
loadJson[case]['caseDetail'] = getAttorneyData(profileDetailUrlParams, session, proxyArr)
outputArr.append(loadJson[case])
# print("Total Scraped Results so far ", len(outputArr))
except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError):
print("Error Found JSON DECODE ERROR - passing for last name", data['lastName'])
except simplejson.errors.JSONDecodeError:
print("Found Simple Json Error", data['lastName'])
pass
# newProxy = generate_random_proxy()
# download_site(data, outputArr, newProxy)
except:
raise
def queueList(sites):
manager = multiprocessing.Manager()
outputArr = manager.list()
functionMain = partial(download_site, outputArr = outputArr)
p = multiprocessing.Pool(10)
records = p.map(functionMain, sites)
p.terminate()
p.join()
if __name__ == "__main__":
outputArr = []
fileData = json.loads(open('lastNamesWithProxy.json').read())[:10]
start_time = time.time()
queueList(fileData)
duration = time.time() - start_time
print(f"Downloaded {len(fileData)} in {duration} seconds")
The download_site function fetches a list via the requests library, and then for each item in that list makes another request through the getAttorneyData function.
How can I tune this further to run faster? I have a high-end computer, so CPU shouldn't be an issue, and I want to use it to its full potential.
My goal is to spawn 10 workers and have each worker consume one request, so that 10 requests take 1-2 seconds instead of the 12 they currently take.
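(For comparison, a minimal sketch with time.sleep standing in for the real requests: a bare Pool(10) over ten simulated one-second downloads finishes in roughly one second, which suggests the extra time above is spent inside each worker, for example in the nested getAttorneyData calls or proxy latency, rather than in the pool machinery itself.)
import multiprocessing
import time

def fake_download(url):
    time.sleep(1)          # stand-in for a 1-second network round trip
    return url

if __name__ == "__main__":
    urls = [f"https://example.com/{i}" for i in range(10)]
    start = time.time()
    with multiprocessing.Pool(10) as pool:
        results = pool.map(fake_download, urls)
    print(f"{len(results)} tasks in {time.time() - start:.1f}s")  # roughly 1s expected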
I'm trying to build a list of parent/comment pairs from the publicly available Reddit data set.
I have a CSV file which I load into a Pandas dataframe which contains rows of the comments with the parent and child id, as well as the child comment. The data is loaded using the following block of code:
import os
import time
import multiprocessing as mp
import numpy as np
import pandas as pd
sourcePATH = r'C:\'
workingFILE = r'\output-pt1.csv'
# filepaths
input_file = sourcePATH + workingFILE
data_df = pd.read_csv(input_file,header=None,names=['PostIDX','ParentIDX','Comment','Score','Controversiality'])
The aim is to scan through each row in the dataframe and use the parent id to search the rest of the dataframe to see whether a parent comment is present. If there is, I store the child and parent comments in a tuple with some other information, append that to a list, and write the list out to a CSV file at the end. To do this I use the following code:
def checkChildParent(ParentIDX_curr, ChildIDX_curr,ChildComment_curr,ChildScore_curr,ChildCont_curr):
idx = data_df.loc[data_df['PostIDX'] == ParentIDX_curr]
if idx.empty is False:
ParentComment = idx.iloc[0,2]
ParentScore = idx.iloc[0,3]
ParentCont = idx.iloc[0,4]
outPut.put([ParentIDX_curr[0], ParentComment,ParentScore,ParentCont,ChildIDX_curr[0], ChildComment_curr[0],ChildScore_curr[0],ChildCont_curr[0]])
if __name__ == '__main__':
print('Process started')
t_start_init = time.time()
t_start = time.time()
noCores = 1
#pool = mp.Pool(processes=noCores)
update_freq = 100
n = 1000
#n = round(len(data_df)/8)
flag_create = 0
flag_run = 0
i = 0
outPut = mp.Queue()
#parent_child_df = pd.DataFrame()
#parent_child_df.coumns = ['PostIDX','ParentIDX']
while i < n:
#print(i)
procs = []
ParentIDX = []
ParentComment = []
ParentScore = []
ParentCont = []
ChildIDX = []
ChildComment = []
ChildScore = []
ChildCont = []
for worker in range(0,noCores):
ParentIDX.append(data_df.iloc[i,1])
ChildIDX.append(data_df.iloc[i,0])
ChildComment.append(data_df.iloc[i,2])
ChildScore.append(data_df.iloc[i,3])
ChildCont.append(data_df.iloc[i,4])
i = i + 1
#when I call the function this way it returns the expected matches
#checkChildParent(ParentIDX,ChildIDX,ChildComment,
# ChildScore,ChildCont)
#when I call the function with Process function nothing appears to be happening
for proc in range(0,noCores):
p = mp.Process(target = checkChildParent, args=(ParentIDX[proc],ChildIDX[proc],ChildComment[proc],ChildScore[proc],ChildCont[proc]))
procs.append(p)
p.start()
#for p in procs:
# p.join()
if outPut.empty() is False:
print(outPut.get())
At the top of the file is a function which scans the dataframe for a given row and returns the tuple of the matched parent and child comment if one is found. If I call this function directly it works fine, but when I launch it with Process it doesn't match anything. I'm guessing the issue is the form in which the arguments are passed to the function, but I have been trying to debug this all afternoon and have failed so far. If anyone has any suggestions, please let me know!
Thanks!
I am trying to adapt the code from this question for my file processing:
Python multiprocessing safely writing to a file
This is my modification of the code:
def listener(q):
'''listens for messages on the q, writes to file. '''
while 1:
reads = q.get()
if reads == 'kill':
#f.write('killed')
break
for read in reads:
out_bam.write(read)
out_bam.flush()
out_bam.close()
def fetch_reads(line, q):
parts = line[:-1].split('\t')
print(parts)
start,end = int(parts[1])-1,int(parts[2])-1
in_bam = pysam.AlignmentFile(args.bam, mode='rb')
fetched = in_bam.fetch(parts[0], start, end)
reads = [read for read in fetched if (read.cigarstring and read.pos >= start and read.pos < end and 'S' not in read.cigarstring)]
in_bam.close()
q.put(reads)
return reads
#must use Manager queue here, or will not work
manager = mp.Manager()
q = manager.Queue()
if not args.threads:
threads = 1
else:
threads = int(args.threads)
pool = mp.Pool(threads+1)
#put listener to work first
watcher = pool.apply_async(listener, (q,))
with open(args.bed,'r') as bed:
jobs = []
cnt = 0
for line in bed:
# Fire off the read fetchings
job = pool.apply_async(fetch_reads, (line, q))
jobs.append(job)
cnt += 1
if cnt > 10000:
break
# collect results from the workers through the pool result queue
for job in jobs:
job.get()
print('get')
#now we are done, kill the listener
q.put('kill')
pool.close()
The difference is that I open and close the file inside the function, since otherwise I get unusual errors from bgzip.
At first, print(parts) and print('get') are printed in alternation (more or less), then 'get' appears less and less often. Ultimately the code hangs and nothing more is printed (all the parts are printed, but 'get' simply doesn't print anymore). The output file remains zero bytes.
Can anyone lend a hand? Cheers!
There is about a 70% chance that it shows this error:
res=pool.map(feng,urls)
File "c:\Python27\lib\multiprocessing\pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "c:\Python27\lib\multiprocessing\pool.py", line 567, in get
raise self._value
IndexError: list index out of range
I don't know why. If there are fewer than 100 items of data, there is only about a 5% chance of seeing that message. Does anyone have an idea how to improve this?
#coding:utf-8
import multiprocessing
import requests
import bs4
import re
import string
root_url = 'http://www.haoshiwen.org'
#index_url = root_url+'/type.php?c=1'
def xianqin_url():
f = 0
h = 0
x = 0
y = 0
b = []
l=[]
for i in range(1,64):  # number of pages
index_url=root_url+'/type.php?c=1'+'&page='+"%s" % i
response = requests.get(index_url)
soup = bs4.BeautifulSoup(response.text,"html.parser")
x = [a.attrs.get('href') for a in soup.select('div.sons a[href^=/]')]  # collect the links inside each page's div.sons blocks
c=len(x)  # c links in total
j=0
for j in range(c):
url = root_url+x[j]
us = str(url)
print "收集到%s" % us
l.append(url) #pool = multiprocessing.Pool(8)
return l
def feng (url) :
response = requests.get(url)
response.encoding='utf-8'
#print response.text
soup = bs4.BeautifulSoup(response.text, "html.parser")
#content = soup.select('div.shileft')
qq=str(soup)
soupout = re.findall(r"原文(.+?)</div>",qq,re.S)  # the segment starting with "原文" (original text) and ending with </div>
#print soupout[1]
content=str(soupout[1])
b="风"
cc=content.count(b,0,len(content))
return cc
def start_process():
print 'Starting',multiprocessing.current_process().name
def feng (url) :
response = requests.get(url)
response.encoding='utf-8'
#print response.text
soup = bs4.BeautifulSoup(response.text, "html.parser")
#content = soup.select('div.shileft')
qq=str(soup)
soupout = re.findall(r"原文(.+?)</div>",qq,re.S)  # the segment starting with "原文" (original text) and ending with </div>
#print soupout[1]
content=str(soupout[1])
b="风"
c="花"
d="雪"
e="月"
f=content.count(b,0,len(content))
h=content.count(c,0,len(content))
x=content.count(d,0,len(content))
y=content.count(e,0,len(content))
return f,h,x,y
def find(urls):
r= [0,0,0,0]
pool=multiprocessing.Pool()
res=pool.map(feng,urls)
for i in range(len(res)):
r=map(lambda (a,b):a+b, zip(r,res[i]))
return r
if __name__=="__main__":
print "开始收集网址"
qurls=xianqin_url()
print "收集到%s个链接" % len(qurls)
print "开始匹配先秦诗文"
find(qurls)
print '''
Among %s pre-Qin articles:
---------------------------
风 (wind): %s
花 (flower): %s
雪 (snow): %s
月 (moon): %s
data source: %s
''' % (len(qurls),find(qurls)[0],find(qurls)[1],find(qurls)[2],find(qurls)[3],root_url)
(Note: Stack Overflow's posting filter rejected the body with pool.map written out, so the line above originally read res=pool.map4(feng,urls); the actual code uses pool.map, as the traceback shows.)
I'm trying to extract some substrings from this website using multiprocessing.
Indeed, multiprocessing makes it a bit hard to debug, as you don't see where the index out of range error occurred (the error message makes it appear as if it happened internally in the multiprocessing module).
In some cases this line:
content=str(soupout[1])
raises an IndexError because soupout is an empty list. If you change it to
if len(soupout) == 0:
return None
and then remove the Nones that were returned by changing
res=pool.map(feng,urls)
into
res = pool.map(feng,urls)
res = [r for r in res if r is not None]
then you can avoid the error. That said, you probably want to find out the root cause of why re.findall returned an empty list. It is certainly a better idea to select the node with BeautifulSoup than with a regex, as matching with bs4 is generally more stable, especially if the website slightly changes its markup (e.g. whitespace).
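For instance, a hedged sketch of feng using a CSS selector instead of the regex, assuming the poem text lives in the div.shileft container that the commented-out soup.select line hints at:
import requests
import bs4

def feng(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    nodes = soup.select('div.shileft')    # assumed container of the poem text
    if not nodes:                         # markup changed or request failed
        return None
    content = nodes[0].get_text()
    return (content.count(u"风"), content.count(u"花"),
            content.count(u"雪"), content.count(u"月"))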
Update:
why is soupout an empty list? When I didn't use pool.map I never had this error message shown
This is probably because you hammer the web server too quickly. In a comment you mention that you sometimes get 504 in response.status_code. 504 means Gateway Time-out: the server was acting as a gateway or proxy and did not receive a timely response from the upstream server.
That fits, because haoshiwen.org seems to be powered by kangle, which is a reverse proxy. The reverse proxy forwards every request you send it to the web server behind it, and if you start too many processes at once the poor web server cannot handle the flood. Kangle has a default timeout of 60s, so as soon as it doesn't get an answer back from the web server within 60s it shows the error you posted.
How do you fix that?
You could limit the number of processes: pool=multiprocessing.Pool(2); you'd need to experiment to find a good number of processes.
At the top of feng(url) you could add a time.sleep(5) so each process waits 5 seconds between requests; here too you'd need to experiment with the sleep time. A minimal sketch combining both ideas follows.
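This sketch assumes feng and qurls are the function and URL list from the question above, and the pool size and sleep time are only starting points to tune:
import time
import multiprocessing

def feng_throttled(url):
    time.sleep(5)              # wait between requests to avoid flooding the proxy
    return feng(url)           # reuse feng(url) from the question

if __name__ == "__main__":
    pool = multiprocessing.Pool(2)            # only two concurrent workers
    res = pool.map(feng_throttled, qurls)     # qurls collected as before
    res = [r for r in res if r is not None]   # drop failed pages
    pool.close()
    pool.join()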
My program essentially scrapes images from websites (the site names below are made up). I have 3 functions, and each of them scrapes images from a specific website using a parameter. My program contains the following code.
import requests
from bs4 import BeautifulSoup
from multiprocessing import Process
img1 = []
img2 = []
img3 = []
def my_func1(img_search):
del img1[:]
url1 = "http://www.somewebsite.com/" + str(img_search)
r1 = requests.get(url1)
soup1 = BeautifulSoup(r1.content)
data1 = soup1.find_all("div",{"class":"img"})
for item in data1:
try:
img1.append(item.contents[0].find('img')['src'])
except:
img1.append("img Unknown")
return
def my_func2(img_search):
del img2[:]
url2 = "http://www.somewebsite2.com/" + str(img_search)
r2 = requests.get(url2)
soup2 = BeautifulSoup(r2.content)
data2 = soup2.find_all("div",{"class":"img"})
for item in data2:
try:
img2.append(item.contents[0].find('img')['src'])
except:
img2.append("img Unknown")
return
def my_func3(img_search):
del img3[:]
url3 = "http://www.somewebsite3.com/" + str(img_search)
r3 = requests.get(url3)
soup3 = BeautifulSoup(r3.content)
data3 = soup3.find_all("div",{"class":"img"})
for item in data3:
try:
img3.append(item.contents[0].find('img')['src'])
except:
img3.append("img Unknown")
return
my_func1("orange cat")
my_func2("blue cat")
my_func3("green cat")
print(*img1, sep='\n')
print(*img2, sep='\n')
print(*img3, sep='\n')
The scraping works just fine, but it is quite slow, so I decided to use multiprocessing to speed it up, and it did in fact speed things up. I essentially replaced the function calls with this:
p = Process(target=my_func1, args=("orange cat",))
p.start()
p2 = Process(target=my_func2, args=("blue cat",))
p2.start()
p3 = Process(target=my_func3, args=("green cat",))
p3.start()
p.join()
p2.join()
p3.join()
However, when I print the img1, img2, and img3 lists, they are empty. How would I fix this?
When you use multiprocessing to distribute your work between several processes, each process will run in a separate namespace (a copy of the namespace of the main process). The changes you make in the child-process's namespace will not be reflected in the parent process's namespace. You'll need to use a multiprocessing.Queue or some other synchronization method to pass the data back from the worker processes.
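A minimal sketch of the Queue route, with the scraping replaced by a stand-in list so it runs on its own:
from multiprocessing import Process, Queue

def my_func1(img_search, queue):
    images = ["img-a.png", "img-b.png"]   # stand-in for the scraped src values
    queue.put(images)                     # send the list back to the parent

if __name__ == "__main__":
    q = Queue()
    p = Process(target=my_func1, args=("orange cat", q))
    p.start()
    img1 = q.get()    # the parent receives the worker's list here
    p.join()
    print(img1)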
In your example code, your three functions are almost exactly the same, only the web site's domain and the variable names differ. If that's how your real functions look, I suggest using multiprocessing.Pool.map and passing the whole URL to a single function, rather than just passing search terms:
import multiprocessing

def my_func(search_url):
r = requests.get(search_url)
soup = BeautifulSoup(r.content)
data = soup.find_all("div",{"class":"img"})
images = []
for item in data:
try:
images.append(item.contents[0].find('img')['src'])
except:
images.append("img Unknown")
return images
if __name__ == "__main__":
searches = ['http://www.somewebsite1.com/?orange+cat', # or whatever
'http://www.somewebsite2.com/?blue+cat',
'http://www.somewebsite3.com/?green+cat']
pool = multiprocessing.Pool() # will create as many processes as you have CPU cores
results = pool.map(my_func, searches)
pool.close()
# do something with results, which will be a list of the function return values