multiprocessing does not save data - python

My program essentially scrapes images off of websites (the site names in the code are made up). I have 3 functions, and each of them scrapes images off of a specific website using a search parameter. My program contains the following code:
import requests
from bs4 import BeautifulSoup
from multiprocessing import Process

img1 = []
img2 = []
img3 = []

def my_func1(img_search):
    del img1[:]
    url1 = "http://www.somewebsite.com/" + str(img_search)
    r1 = requests.get(url1)
    soup1 = BeautifulSoup(r1.content)
    data1 = soup1.find_all("div", {"class": "img"})
    for item in data1:
        try:
            img1.append(item.contents[0].find('img')['src'])
        except:
            img1.append("img Unknown")
    return

def my_func2(img_search):
    del img2[:]
    url2 = "http://www.somewebsite2.com/" + str(img_search)
    r2 = requests.get(url2)
    soup2 = BeautifulSoup(r2.content)
    data2 = soup2.find_all("div", {"class": "img"})
    for item in data2:
        try:
            img2.append(item.contents[0].find('img')['src'])
        except:
            img2.append("img Unknown")
    return

def my_func3(img_search):
    del img3[:]
    url3 = "http://www.somewebsite3.com/" + str(img_search)
    r3 = requests.get(url3)
    soup3 = BeautifulSoup(r3.content)
    data3 = soup3.find_all("div", {"class": "img"})
    for item in data3:
        try:
            img3.append(item.contents[0].find('img')['src'])
        except:
            img3.append("img Unknown")
    return
my_func1("orange cat")
my_func2("blue cat")
my_func3("green cat")
print(*img1, sep='\n')
print(*img2, sep='\n')
print(*img3, sep='\n')
The scraping works just fine, but it is quite slow, so I decided to use multiprocessing to speed it up, and multiprocessing did in fact speed it up. I essentially replaced the function calls with this:
p = Process(target=my_func1, args=("orange cat",))
p.start()
p2 = Process(target=my_func2, args=("blue cat",))
p2.start()
p3 = Process(target=my_func3, args=("green cat",))
p3.start()
p.join()
p2.join()
p3.join()
However, when I print the img1, img2, and img3 lists, they are empty. How would I fix this?

When you use multiprocessing to distribute your work between several processes, each process runs in a separate namespace (a copy of the main process's namespace). Changes you make in a child process's namespace are not reflected in the parent process's namespace. You'll need to use a multiprocessing.Queue or some other synchronization mechanism to pass the data back from the worker processes.
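For instance, here is a minimal sketch of the Queue approach applied to my_func1 from the question; the scraping body is elided and replaced with a placeholder list, and the worker puts its result on the queue instead of mutating a global:

from multiprocessing import Process, Queue

def my_func1(img_search, queue):
    # ... the same scraping code as in the question, collected into a local list ...
    images = ["img Unknown"]  # placeholder for the scraped 'src' values
    queue.put(images)         # send the data back to the parent instead of mutating a global

if __name__ == "__main__":
    q = Queue()
    p = Process(target=my_func1, args=("orange cat", q))
    p.start()
    img1 = q.get()  # blocks until the worker puts its result
    p.join()
    print(*img1, sep='\n')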
In your example code, your three functions are almost exactly the same; only the website's domain and the variable names differ. If that's how your real functions look, I suggest using multiprocessing.Pool.map and passing the whole URL to a single function, rather than just passing search terms:
import multiprocessing
import requests
from bs4 import BeautifulSoup

def my_func(search_url):
    r = requests.get(search_url)
    soup = BeautifulSoup(r.content)
    data = soup.find_all("div", {"class": "img"})
    images = []
    for item in data:
        try:
            images.append(item.contents[0].find('img')['src'])
        except:
            images.append("img Unknown")
    return images

if __name__ == "__main__":
    searches = ['http://www.somewebsite1.com/?orange+cat',  # or whatever
                'http://www.somewebsite2.com/?blue+cat',
                'http://www.somewebsite3.com/?green+cat']
    pool = multiprocessing.Pool()  # will create as many processes as you have CPU cores
    results = pool.map(my_func, searches)
    pool.close()
    # do something with results, which will be a list of the function return values
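As a small follow-up, pool.map preserves the input order, so results[0] corresponds to searches[0] and so on. A sketch of how the parent might consume them, continuing the if __name__ block above (the URLs are still the made-up placeholders):

    for search_url, images in zip(searches, results):
        print(search_url)
        print(*images, sep='\n')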

Related

How do I make this faster with multi-processing?

import requests
from lxml import html

returns = []
for x in range(40, 80):
    url = f'https://www.mutualfundindia.com/MF/Performance/Details?id={x}'
    r = requests.get(url)
    tree = html.fromstring(r.content)
    inception = tree.xpath('//*[@id="collPerformanceAnalysis"]/div/div[3]/div[7]')
    for i in inception:
        if i.text != ' ':
            returns.append(str.strip(i.text))
This currently takes ~60 seconds for 40 results. I saw online that I can make it faster with multiprocessing. I watched many videos but I couldn't get it to work. Please help.
Here is a solution using multiprocessing.Pool. You can tune the number of processes further to find the sweet spot; for now, it creates as many processes as there are CPU cores available on your machine.
import multiprocessing
# Other imports here...

returns = []

def callback(result):
    returns.extend(result)

def f(x):
    url = f'https://www.mutualfundindia.com/MF/Performance/Details?id={x}'
    r = requests.get(url)
    tree = html.fromstring(r.content)
    inception = tree.xpath('//*[@id="collPerformanceAnalysis"]/div/div[3]/div[7]')
    result = []
    for i in inception:
        if i.text != ' ':
            result.append(str.strip(i.text))
    return result

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for i in range(40, 80):
        pool.apply_async(f, args=(i,), callback=callback)
    pool.close()
    pool.join()
    print(returns)
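One note on why this works: apply_async's callback runs in the parent process, so extending the global returns list there is safe. If you prefer something more direct (and want to preserve the id order), an equivalent sketch with pool.map, reusing the f defined above, could look like this:

if __name__ == "__main__":
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = pool.map(f, range(40, 80))  # one list of strings per id, in input order
    returns = [item for sublist in results for item in sublist]  # flatten the per-id lists
    print(returns)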

Improve speed on current multiprocessing.Pool()

I have a JSON file with a list of URLs.
After reading the documentation, I figured multiprocessing.Pool is the best option for me.
I ran 10 URLs with multiprocessing.Pool(10) and expected the results to be pretty much instant, but it takes about 12 seconds to complete everything. I'm not sure if I am using it correctly; below is my code.
def download_site(data, outputArr, proxyArr=None):
    session = requests.Session()
    # print("Scraping last name {lastName}".format(lastName=data['lastName']))
    userAgents = open('user-agents.txt').read().split('\n')
    params = (
        ('Name', data['lastName']),
        ('type', 'P'),
    )
    url = 'someurl'
    if not proxyArr:
        proxyArr = {
            'http': data['proxy']['http']
        }
    try:
        with session.get(url, params=params, proxies=proxyArr, headers=headers) as response:
            name = multiprocessing.current_process().name
            try:
                content = response.json()
                loadJson = json.loads(content)['nameBirthDetails']
                for case in loadJson:
                    dateFile = loadJson[case]['dateFiled']
                    year = int(dateFile.split('/')[-1])
                    if year > 2018:
                        profileDetailUrlParams = (
                            ('caseId', loadJson[case]['caseYear']),
                            ('caseType', 'WC'),
                            ('caseNumber', loadJson[case]['caseSeqNbr']),
                        )
                        loadJson[case]['caseDetail'] = getAttorneyData(profileDetailUrlParams, session, proxyArr)
                        outputArr.append(loadJson[case])
                        # print("Total Scraped Results so far ", len(outputArr))
            except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError):
                print("Error Found JSON DECODE ERROR - passing for last name", data['lastName'])
            except simplejson.errors.JSONDecodeError:
                print("Found Simple Json Error", data['lastName'])
                pass
                # newProxy = generate_random_proxy()
                # download_site(data, outputArr, newProxy)
    except:
        raise

def queueList(sites):
    manager = multiprocessing.Manager()
    outputArr = manager.list()
    functionMain = partial(download_site, outputArr=outputArr)
    p = multiprocessing.Pool(10)
    records = p.map(functionMain, sites)
    p.terminate()
    p.join()

if __name__ == "__main__":
    outputArr = []
    fileData = json.loads(open('lastNamesWithProxy.json').read())[:10]
    start_time = time.time()
    queueList(fileData)
    duration = time.time() - start_time
    print(f"Downloaded {len(fileData)} in {duration} seconds")
The function download_site is where I fetch a list via the requests library; then, for each item in the list, I make another request via the getAttorneyData function.
How can I further hone this to run faster? I have a high-end computer, so CPU shouldn't be an issue; I want to use it to its max potential.
My goal is to spawn 10 workers and have each worker consume one request, so that 10 requests would take 1-2 seconds instead of the 12 they currently take.
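One observation that may help: this workload is network-bound rather than CPU-bound, so the pool size does not have to match the core count, and threads are often enough. A minimal sketch of that idea with multiprocessing.pool.ThreadPool (fetch_one is a hypothetical stand-in for download_site, and the URL and inputs are placeholders):

from multiprocessing.pool import ThreadPool
import requests

def fetch_one(last_name):
    # hypothetical stand-in for download_site: one HTTP round-trip per record
    with requests.Session() as session:
        response = session.get('https://example.com', params={'Name': last_name})
        return response.status_code

if __name__ == "__main__":
    last_names = ['smith', 'jones', 'lee']  # placeholder inputs
    with ThreadPool(20) as pool:  # threads are cheap for I/O-bound work
        results = pool.map(fetch_one, last_names)
    print(results)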

Conflicting functions using multiprocessing package

I have a function executed in parallel using the multiprocessing package, while another one is executed in the main process. The parallel function runs until the main one is finished. Both of them work, but not together. The functions share only one argument in common (a global variable), and both use the BeautifulSoup package. Also, the functions have internal local variables with the same names, but I don't think this is a problem.
The code that calls the functions is:
stop_event, output = mp.Event(), mp.Queue()
count_price_change = mp.Process(target=count, args=(stop_event, output, data_2, header_2, driver, lancer))
count_price_change.start()
start = time.time()
collected = create_array_features(data_1, header_1, prices, change, changepct, volume, lancer)
#PROBLEM IS HERE, eventually with lancer ??
stop = time.time()
time.sleep(frequency_get - (stop - start))
stop_event.set()
counts_changes = output.get()
count_price_change.join()
count_price_change.terminate()
The exact error I get is:
Traceback (most recent call last):
  File "/usr/lib/python3.6/code.py", line 91, in runcode
    exec(code, self.locals)
  File "<input>", line 6, in <module>
  File "<input>", line 80, in create_array_features
  File "/usr/local/lib/python3.6/dist-packages/bs4/__init__.py", line 300, in __init__
    markup, from_encoding, exclude_encodings=exclude_encodings)):
  File "/usr/local/lib/python3.6/dist-packages/bs4/builder/_htmlparser.py", line 240, in prepare_markup
    exclude_encodings=exclude_encodings)
  File "/usr/local/lib/python3.6/dist-packages/bs4/dammit.py", line 374, in __init__
    for encoding in self.detector.encodings:
  File "/usr/local/lib/python3.6/dist-packages/bs4/dammit.py", line 265, in encodings
    self.markup, self.is_html)
  File "/usr/local/lib/python3.6/dist-packages/bs4/dammit.py", line 323, in find_declared_encoding
    declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
TypeError: expected string or bytes-like object
I have tried building minimal reproducible examples, but they all work. I am unable to figure out why I get this error. The whole code is (start at the bottom):
def parse_html(names, driver):
    # need to make it wait until elements appear otherwise doesn't catch anything
    list_names = []
    for i in range(len(names)):
        html_name = names[i].get_attribute('innerHTML')
        soup = BeautifulSoup(html_name, "html.parser")
        match = soup.find('a').text
        list_names.append(match)
    lancer = datetime.datetime.now().replace(microsecond=0).isoformat(' ')
    prices = driver.find_elements_by_css_selector('div.item-field.item-price')
    change = driver.find_elements_by_css_selector('div.item-field.item-change.pn--color')
    changepct = driver.find_elements_by_css_selector('div.item-field.item-changepct.pn--color')
    volume = driver.find_elements_by_css_selector('div.item-field.item-volume.is-made')
    return list_names, lancer, prices, change, changepct, volume

def create_array_features(data, header, prices, change, changepct, volume, lancer):
    # create an array of features
    try:
        list_to_parse = data.columns.get_level_values(0).unique()
    except:
        list_to_parse = data.get_level_values(0).unique()  # this is for when the dataframe is empty
    features = []
    for i in range(len(list_to_parse)):
        # prices
        html = prices[i].get_attribute('innerHTML')
        soup = BeautifulSoup(html, "html.parser")
        soup = [text for text in soup.find_all(text=True) if text.parent.name != "label"][1]
        match = prq.check_unit(soup)
        features.append(match)
        # parse some other stuff...
    collected = pd.DataFrame([features], columns=header)
    collected['time'] = pd.to_datetime(lancer)
    collected = collected.set_index('time')
    return collected

def count(stop_event, output, data_2, header_2, driver, lancer):
    try:
        list_to_parse = data_2.columns.get_level_values(0).unique()
    except:
        list_to_parse = data_2.get_level_values(0).unique()  # this is for when the dataframe is empty
    change_count_prices = [0 for i in range(len(list_to_parse))]
    change_count_volume = [0 for i in range(len(list_to_parse))]
    last_prices = [None for i in range(len(list_to_parse))]
    last_volumes = [None for i in range(len(list_to_parse))]
    while not stop_event.is_set():
        WebDriverWait(driver, 60).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'section.watchlist__container.expanded')))
        names_ = driver.find_elements_by_css_selector('div.item-symbol-inner')
        _, _, prices, _, _, volume = parse_html(names_, driver)  # each time must update the price in order to check if it has changed
        for i in range(len(list_to_parse)):
            html = prices[i].get_attribute('innerHTML')
            soup = BeautifulSoup(html, "html.parser")
            soup = [text for text in soup.find_all(text=True) if text.parent.name != "label"][1]
            match = prq.check_unit(soup)
            if match != last_prices[i]:
                change_count_prices[i] += 1
                last_prices[i] = match
            # do some other stuff
    change_count_prices.extend(change_count_volume)
    print("exited and will stop")
    collected = pd.DataFrame([change_count_prices], columns=header_2)
    collected['time'] = pd.to_datetime(lancer)
    collected = collected.set_index('time')
    output.put(collected)  # LIFO queue with output that comes at the end when the event is set

def collect(data_1, data_2, header_1, header_2, frequency_get, driver,
            lancer, prices, change, changepct, volume):
    stop_event, output = mp.Event(), mp.Queue()
    count_price_change = mp.Process(target=count, args=(stop_event, output, data_2, header_2, driver, lancer))
    count_price_change.start()
    start = time.time()
    collected = create_array_features(data_1, header_1, prices, change, changepct, volume, lancer)
    stop = time.time()
    time.sleep(frequency_get - (stop - start))
    stop_event.set()
    counts_changes = output.get()
    count_price_change.join()
    count_price_change.terminate()
Because I can't recreate your problem, I would generally suggest not using shared variables between processes unless you know the state of each one at all times. Also, you should keep the two functions or scripts in the same directory, because otherwise many fork-related issues can show up as errors.
So, generally, you can solve issues like this by communicating between the main and child processes with a multiprocessing Pipe(), sharing the state of the variable in real time:
parent_conn, child_conn = Pipe()
p = Process(target=plot_map_function, args=(self.variable1, self.variable2, child_conn))
p.daemon = True
p.start()
parent_conn.recv()
p.join()
In the other function:
def plot_map_function(var1, var2, conn_to_parent):
    # do something with the variables
    conn_to_parent.send(variable_i_want_to_send)
    conn_to_parent.close()
Hope it helps
In the end, I was able to find the answer. The problem comes from the fact that Selenium cannot handle multiple tabs in parallel with the same driver. In order to use Selenium with multiple workers, you must use a different driver per worker, which is not what I was doing above.
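A minimal sketch of that one-driver-per-worker pattern, assuming Chrome and a hypothetical scrape_title helper; the key point is that each process creates and owns its own driver instead of receiving a shared one:

import multiprocessing as mp
from selenium import webdriver

def scrape_title(url, output):
    driver = webdriver.Chrome()  # each worker builds its own driver
    try:
        driver.get(url)
        output.put((url, driver.title))
    finally:
        driver.quit()

if __name__ == "__main__":
    urls = ["https://example.com", "https://example.org"]  # placeholder URLs
    output = mp.Queue()
    workers = [mp.Process(target=scrape_title, args=(u, output)) for u in urls]
    for w in workers:
        w.start()
    results = [output.get() for _ in urls]  # drain the queue before joining
    for w in workers:
        w.join()
    print(results)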

Issues getting Python Multiprocessing Library Working when using the Process Function

I'm trying to build a list of parent/comment pairs from the publicly available Reddit data set.
I have a CSV file which I load into a Pandas dataframe; it contains rows of comments with the parent and child IDs, as well as the child comment. The data is loaded using the following block of code:
import os
import time
import multiprocessing as mp
import numpy as np
import pandas as pd

sourcePATH = r'C:\'
workingFILE = r'\output-pt1.csv'

# filepaths
input_file = sourcePATH + workingFILE

data_df = pd.read_csv(input_file, header=None,
                      names=['PostIDX', 'ParentIDX', 'Comment', 'Score', 'Controversiality'])
The aim is to scan through each row in the dataframe and use the parent ID to search through the rest of the dataframe to see if there is a parent comment present. If there is, I then store the child and parent comments in a tuple with some other information. This is then added to a list which will be written out to a CSV file at the end. To do this I use the following code:
def checkChildParent(ParentIDX_curr, ChildIDX_curr, ChildComment_curr, ChildScore_curr, ChildCont_curr):
    idx = data_df.loc[data_df['PostIDX'] == ParentIDX_curr]
    if idx.empty is False:
        ParentComment = idx.iloc[0, 2]
        ParentScore = idx.iloc[0, 3]
        ParentCont = idx.iloc[0, 4]
        outPut.put([ParentIDX_curr[0], ParentComment, ParentScore, ParentCont,
                    ChildIDX_curr[0], ChildComment_curr[0], ChildScore_curr[0], ChildCont_curr[0]])

if __name__ == '__main__':
    print('Process started')
    t_start_init = time.time()
    t_start = time.time()

    noCores = 1
    #pool = mp.Pool(processes=noCores)
    update_freq = 100
    n = 1000
    #n = round(len(data_df)/8)

    flag_create = 0
    flag_run = 0
    i = 0

    outPut = mp.Queue()

    #parent_child_df = pd.DataFrame()
    #parent_child_df.coumns = ['PostIDX','ParentIDX']

    while i < n:
        #print(i)
        procs = []
        ParentIDX = []
        ParentComment = []
        ParentScore = []
        ParentCont = []
        ChildIDX = []
        ChildComment = []
        ChildScore = []
        ChildCont = []

        for worker in range(0, noCores):
            ParentIDX.append(data_df.iloc[i, 1])
            ChildIDX.append(data_df.iloc[i, 0])
            ChildComment.append(data_df.iloc[i, 2])
            ChildScore.append(data_df.iloc[i, 3])
            ChildCont.append(data_df.iloc[i, 4])
            i = i + 1

        #when I call the function this way it returns the expected matches
        #checkChildParent(ParentIDX,ChildIDX,ChildComment,
        #                 ChildScore,ChildCont)

        #when I call the function with the Process function nothing appears to be happening
        for proc in range(0, noCores):
            p = mp.Process(target=checkChildParent,
                           args=(ParentIDX[proc], ChildIDX[proc], ChildComment[proc],
                                 ChildScore[proc], ChildCont[proc]))
            procs.append(p)
            p.start()

        #for p in procs:
        #    p.join()

        if outPut.empty() is False:
            print(outPut.get())
At the top of the file is a function which scans the dataframe for a given row and returns a tuple of the matched parent and child comments if one was found. If I call this function normally it works fine, but when I call it via the Process function it doesn't match anything! I'm guessing the issue is the form of the arguments being passed to the function, but I have been trying to debug this all afternoon and have failed so far. If anyone has any suggestions then please let me know!
Thanks!

Python Multiprocessing Process causes Parent to idle

My question is very similar to this question here, except the solution with catching didn't quite work for me.
Problem: I'm using multiprocessing to handle a file in parallel. Around 97% of the time it works. However, sometimes the parent process will idle forever and CPU usage shows 0.
Here is a simplified version of my code:
from PIL import Image
import imageio
from multiprocessing import Process, Manager

def split_ranges(min_n, max_n, chunks=4):
    chunksize = ((max_n - min_n) / chunks) + 1
    return [range(x, min(max_n-1, x+chunksize)) for x in range(min_n, max_n, chunksize)]

def handle_file(file_list, vid, main_array):
    for index in file_list:
        try:
            #Do Stuff
            valid_frame = Image.fromarray(vid.get_data(index))
            main_array[index] = 1
        except:
            main_array[index] = 0

def main(file_path):
    mp_manager = Manager()
    vid = imageio.get_reader(file_path, 'ffmpeg')
    num_frames = vid._meta['nframes'] - 1
    list_collector = mp_manager.list(range(num_frames)) #initialize a list the size of the number of frames in the video
    total_list = split_ranges(10, min(200, num_frames), 4) #some arbitrary numbers between 0 and num_frames of the video
    processes = []
    file_readers = []
    for split_list in total_list:
        video = imageio.get_reader(file_path, 'ffmpeg')
        proc = Process(target=handle_file, args=(split_list, video, list_collector))
        print "Started Process" #Always gets printed
        proc.Daemon = False
        proc.start()
        processes.append(proc)
        file_readers.append(video)
    for i, proc in enumerate(processes):
        proc.join()
        print "Join Process " + str(i) #Doesn't get printed
        fd = file_readers[i]
        fd.close()
    return list_collector
The issue is that I can see the processes starting and I can see that all of the items are being handled. However, sometimes the processes don't rejoin. When I check back, only the parent process is there, idling as if it's waiting for something. None of the child processes are there, but I don't think the join ever completes, because my print statement doesn't show up.
My hypothesis is that this happens with videos that have a lot of broken frames. However, it's a bit hard to reproduce this error because it rarely occurs.
EDIT: The code should be valid now. I'm trying to find a file that can reproduce this error.
