How do I make this faster with multi-processing? - python

import requests
from lxml import html

returns = []
for x in range(40, 80):
    url = f'https://www.mutualfundindia.com/MF/Performance/Details?id={x}'
    r = requests.get(url)
    tree = html.fromstring(r.content)
    inception = tree.xpath('//*[@id="collPerformanceAnalysis"]/div/div[3]/div[7]')
    for i in inception:
        if i.text != ' ':
            returns.append(str.strip(i.text))
This is currently taking ~60 seconds for 40 results. I saw online that I can make it faster with multiprocessing. I watched many videos but I couldn't get it to work. Please help.

Here is a solution using multiprocessing.Pool. You can tune the number of worker processes (the processes argument, set below from multiprocessing.cpu_count()) to find the sweet spot. For now, it creates as many processes as there are CPU cores on your machine.
import multiprocessing

import requests
from lxml import html

returns = []

def callback(result):
    # Runs in the main process as each worker finishes.
    returns.extend(result)

def f(x):
    url = f'https://www.mutualfundindia.com/MF/Performance/Details?id={x}'
    r = requests.get(url)
    tree = html.fromstring(r.content)
    inception = tree.xpath('//*[@id="collPerformanceAnalysis"]/div/div[3]/div[7]')
    result = []
    for i in inception:
        if i.text != ' ':
            result.append(str.strip(i.text))
    return result

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for i in range(40, 80):
        pool.apply_async(f, args=(i,), callback=callback)
    pool.close()
    pool.join()
    print(returns)
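For reference, the same work can also be written with pool.map, which hands back one list per id in input order and avoids the callback; a minimal alternative sketch (same site and XPath as above):

import multiprocessing
from itertools import chain

import requests
from lxml import html

def f(x):
    # Same scraping logic as above: fetch one performance page and keep the non-empty cells.
    url = f'https://www.mutualfundindia.com/MF/Performance/Details?id={x}'
    r = requests.get(url)
    tree = html.fromstring(r.content)
    inception = tree.xpath('//*[@id="collPerformanceAnalysis"]/div/div[3]/div[7]')
    return [str.strip(i.text) for i in inception if i.text != ' ']

if __name__ == "__main__":
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        per_id = pool.map(f, range(40, 80))   # one list per id, in request order
    returns = list(chain.from_iterable(per_id))
    print(returns)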

Related

Python: Serial, Threading, Multiprocessing. Which is the fastest way?

So I have code that needs to do HTTP requests (let's say 1000). So far I have approached it in 3 ways with 50 HTTP requests. The results and code are below.
The fastest is the approach using threads; the issue is that I lose some data (from what I understood, due to the GIL). My questions are the following:
My understanding is that the correct approach in this case is to use multiprocessing. Is there any way I can improve the speed of that approach? Matching the threading time would be great.
I would guess that the more links I have, the more time the serial and threading approaches would take, while the multiprocessing approach would increase much more slowly. Do you have any source that would let me estimate the time it would take to run the code with n links?
Serial - Time To Run around 10 seconds
def get_data(link, **kwargs):
    data = requests.get(link)
    if "queue" in kwargs and isinstance(kwargs["queue"], queue.Queue):
        kwargs["queue"].put(data)
    else:
        return data

links = [link_1, link_2, ..., link_n]
matrix = []
for link in links:
    matrix.append(get_data(link))
Threads - Time To Run around 0.8 of a second
def get_data_thread(links):
    q = queue.Queue()
    for link in links:
        data = threading.Thread(target=get_data, args=(link, ), kwargs={"queue": q})
        data.start()
        data.join()
    return q

matrix = []
q = get_data_thread(links)
while not q.empty():
    matrix.append(q.get())
Multiprocessing - Time To Run around 5 seconds
def get_data_pool(links):
    p = mp.Pool()
    data = p.map(get_data, links)
    return data

if __name__ == "__main__":
    matrix = get_data_pool(links)
If I were to suggest anything, I would go with AIOHTTP. A sketch of the code:
import aiohttp
import asyncio

links = [link_1, link_2, ..., link_n]

async def fetch(session, link):
    async with session.get(link) as resp:
        return await resp.text()

async def main(links):
    # One shared session; gather keeps all the requests in flight at the same time.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, link) for link in links))

if __name__ == "__main__":
    matrix = asyncio.run(main(links))
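Because asyncio.gather awaits all the downloads concurrently on one event loop, the total wall time is roughly that of the slowest response rather than the sum of all of them; note that aiohttp is a third-party package and has to be installed separately.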

Improve speed on current multiprocessing.Pool()

I have a JSON file with a list of URLs.
After reading the documentation, I figured multiprocessing.Pool is the best option for me.
I ran 10 URLs with multiprocessing.Pool(10). I was expecting the results to be pretty much instant, but it takes about 12 seconds to complete everything. I'm not sure if I am using it correctly; my code is below.
def download_site(data, outputArr, proxyArr=None):
    session = requests.Session()
    # print("Scraping last name {lastName}".format(lastName=data['lastName']))
    userAgents = open('user-agents.txt').read().split('\n')
    params = (
        ('Name', data['lastName']),
        ('type', 'P'),
    )
    url = 'someurl'
    if not proxyArr:
        proxyArr = {
            'http': data['proxy']['http']
        }
    try:
        with session.get(url, params=params, proxies=proxyArr, headers=headers) as response:
            name = multiprocessing.current_process().name
            try:
                content = response.json()
                loadJson = json.loads(content)['nameBirthDetails']
                for case in loadJson:
                    dateFile = loadJson[case]['dateFiled']
                    year = int(dateFile.split('/')[-1])
                    if year > 2018:
                        profileDetailUrlParams = (
                            ('caseId', loadJson[case]['caseYear']),
                            ('caseType', 'WC'),
                            ('caseNumber', loadJson[case]['caseSeqNbr']),
                        )
                        loadJson[case]['caseDetail'] = getAttorneyData(profileDetailUrlParams, session, proxyArr)
                        outputArr.append(loadJson[case])
                        # print("Total Scraped Results so far ", len(outputArr))
            except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError):
                print("Error Found JSON DECODE ERROR - passing for last name", data['lastName'])
            except simplejson.errors.JSONDecodeError:
                print("Found Simple Json Error", data['lastName'])
                pass
                # newProxy = generate_random_proxy()
                # download_site(data, outputArr, newProxy)
    except:
        raise

def queueList(sites):
    manager = multiprocessing.Manager()
    outputArr = manager.list()
    functionMain = partial(download_site, outputArr=outputArr)
    p = multiprocessing.Pool(10)
    records = p.map(functionMain, sites)
    p.terminate()
    p.join()

if __name__ == "__main__":
    outputArr = []
    fileData = json.loads(open('lastNamesWithProxy.json').read())[:10]
    start_time = time.time()
    queueList(fileData)
    duration = time.time() - start_time
    print(f"Downloaded {len(fileData)} in {duration} seconds")
download_site is the function where I fetch a list via the requests library; then, for each item in the list, I make another request via the getAttorneyData function.
How can I further hone this to run faster? I have a high-end computer, so CPU shouldn't be an issue; I want to use it to its max potential.
My goal is to be able to spawn 10 workers and have each worker handle one request, so 10 requests would take 1-2 seconds instead of the 12 it currently takes.
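Since each call to download_site spends nearly all of its time waiting on the network rather than on the CPU, a thread pool is often suggested as an alternative to a process pool for this kind of job. Below is a minimal sketch, not the original code, that reuses the download_site and lastNamesWithProxy.json from above and a plain list instead of a Manager().list(), since threads share memory.

import json
import time
from concurrent.futures import ThreadPoolExecutor

def queueListThreaded(sites, workers=10):
    outputArr = []                     # a plain list is fine here: threads share memory
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # list(...) forces evaluation so any exception raised inside a worker surfaces here
        list(executor.map(lambda data: download_site(data, outputArr), sites))
    return outputArr

if __name__ == "__main__":
    fileData = json.loads(open('lastNamesWithProxy.json').read())[:10]
    start_time = time.time()
    results = queueListThreaded(fileData)
    print(f"Downloaded {len(fileData)} in {time.time() - start_time} seconds")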

Apply multiprocessing pool to function which takes multiple lists as argument

I have the following function:
def match_keywords(reviews_match, nlu_match, keywords_match):
    for j in range(df_NLU_Reviews.shape[0]):
        if (j % 1000) == 0:
            print(j)
        keywords = df_NLU_Reviews.Keywords.iloc[j]
        for i in range(len(sentences)):
            try:
                counter = 0
                for keyword in keywords:
                    if keyword in sentences[i]:
                        counter += 1
                if len(keywords) == counter:
                    reviews_match.append(sentences[i])
                    nlu_match.append(df_NLU_Reviews.NLU_Review.iloc[j])
                    keywords_match.append(df_NLU_Reviews.Keywords.iloc[j])
                    sentences.remove(sentences[i])
                    break
            except Exception as e:
                print(i)
                print(j)
                raise e
    df_match = pd.DataFrame()
    df_match['Reviews'] = reviews_match
    df_match['NLU'] = nlu_match
    df_match['Keywords'] = keywords_match
    df_match.to_pickle("Match_Reviews.pkl")
    return df_match
This function takes 3 empty lists as arguments, which are filled during its execution.
I want to parallelize it using multiprocessing.Pool, but I can't figure out how to do it.
I have tried this:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes=12) as pool:
        results = pool.map(match_keywords, zip(reviews_match, nlu_match, keywords_match))
        print(results)
this:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes=12) as pool:
        results = pool.map(match_keywords, zip(match_list))
        print(results)
and this too:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes=12) as pool:
        results = pool.starmap(match_keywords, zip(reviews_match, nlu_match, keywords_match))
        print(results)
But none of these works; they either throw errors or produce empty lists as output. If I run the function without parallelization like this:
match_keywords(reviews_match, nlu_match, keywords_match)
It works just fine. Could someone please show me the right way to do this and explain why my attempts are not working?
Thank you very much in advance.
Your last variant (starmap) is the correct way to call it, but check it again: the three lists are empty at that point, so zip produces no argument tuples and the pool has nothing to run, which is why you get an empty result.
And even with data in the lists, I don't think you can parallelise it this way, because it is not equivalent to running
match_keywords(reviews_match, nlu_match, keywords_match)
in many processes; it is equivalent to running:
match_keywords(reviews_match[0], nlu_match[0], keywords_match[0])
match_keywords(reviews_match[1], nlu_match[1], keywords_match[1])
match_keywords(reviews_match[2], nlu_match[2], keywords_match[2])
match_keywords(reviews_match[3], nlu_match[3], keywords_match[3])
...
once per index.
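To make that concrete: Pool.starmap submits one call per tuple that zip yields, so three empty lists produce no work at all, while non-empty lists produce one call per index. A small self-contained illustration using a stand-in with the same signature as match_keywords (not the real function):

from multiprocessing import Pool

def match_keywords(reviews_match, nlu_match, keywords_match):
    # Stand-in: just echo the arguments each worker call receives.
    return (reviews_match, nlu_match, keywords_match)

if __name__ == '__main__':
    with Pool(processes=2) as pool:
        # Empty lists -> zip yields nothing -> no tasks -> empty result.
        print(pool.starmap(match_keywords, zip([], [], [])))
        # Non-empty lists -> one call per index, exactly as described above.
        print(pool.starmap(match_keywords, zip([1, 2], ['a', 'b'], ['x', 'y'])))
        # [(1, 'a', 'x'), (2, 'b', 'y')]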

Multiprocessing list search

So I'm using processes and a queue to search through data and find the rows that have the same entry in different columns. I decided to use multiprocessing to try to make it scale for large data. The file has 1000 lines and 10 points of data per line. When I read in 80 lines of the data, the program stalls; with 70 lines it works fine, and at a decent speed too.
My question is: what am I doing wrong, or are there limitations to this approach that I haven't identified? The code isn't perfect by any means and is probably bad in itself. It is as follows:
from multiprocessing import Process, Queue
import random

def openFile(file_name, k, division):
    i = 0
    dataSet = []
    with open(file_name) as f:
        for line in f:
            stripLine = line.strip('\n')
            splitLine = stripLine.split(division)
            dataSet += [splitLine]
            i += 1
            if(i == k):
                break
    return(dataSet)

def setCombination(q, data1, data2):
    newData = []
    for i in range(0, len(data1)):
        for j in range(0, len(data2)):
            if(data1[i][1] == data2[j][3]):
                newData += data2[j]
    q.put(newData)

if __name__ == '__main__':
    # Takes in the file, the length of the data to read in, and how the data is divided.
    data = openFile('testing.txt', 80, ' ')
    for i in range(len(data)):
        for j in range(len(data[i])):
            try:
                data[i][j] = float(data[i][j])
            except ValueError:
                pass
    #print(data)
    k = len(data)//10
    q = Queue()
    processes = [Process(target=setCombination, args=(q, data[k*x: k + k*x], data))
                 for x in range(10)]
    for p in processes:
        p.start()
    # Exit the completed processes
    for p in processes:
        p.join()
    saleSet = [q.get() for p in processes]
    print('\n', saleSet)
The data file testing.txt
It appears that something about what your code does is causing a deadlock. While experimenting, I noticed that 3 out of the 10 tasks would never terminate, but, to be honest, I don't really know the reason(s) why.
The good news is it's easy to fix by just removing or disabling the
# Exit the completed processes
for p in processes:
    p.join()
loop you have in your code.
Here's a complete version of your code with (mostly) just that modification in it:
from multiprocessing import Process, Queue

def openFile(file_name, k, division):
    i = 0
    dataSet = []
    with open(file_name) as f:
        for line in f:
            stripLine = line.strip('\n')
            splitLine = stripLine.split(division)
            dataSet += [splitLine]
            i += 1
            if i == k:
                break
    return dataSet

def setCombination(q, data1, data2):
    newData = []
    for i in range(len(data1)):
        for j in range(len(data2)):
            if data1[i][1] == data2[j][3]:
                newData += data2[j]
    q.put(newData)

if __name__ == '__main__':
    # Takes in the file, the length of the data to read in, and how the data is divided.
    data = openFile('testing.txt', 80, ' ')
    for i in range(len(data)):
        for j in range(len(data[i])):
            try:
                data[i][j] = float(data[i][j])
            except ValueError:
                pass

    k = len(data) // 10
    q = Queue()
    processes = [Process(target=setCombination, args=(q, data[k*x: k*x+k], data))
                 for x in range(10)]
    for p in processes:
        p.start()

    # NO LONGER USED (HANGS)
    # # Exit the completed processes
    # for p in processes:
    #     p.join()

    # note: this works since by default, get() will block until it can retrieve something
    saleSet = [q.get() for _ in processes]  # a queue item should be added by each Process
    print('\n', saleSet)
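As for why the original ordering hangs, one likely explanation comes from the multiprocessing documentation's note on joining processes that use queues: a child process that has put items on a multiprocessing.Queue does not terminate until its buffered data has been flushed to the underlying pipe, so join()-ing every worker before anything has been read from the queue can deadlock once the results get big enough (which is presumably why 70 input lines work and 80 do not). Draining the queue first and joining afterwards is the documented-safe ordering; here is a minimal stand-alone sketch of that pattern, with a stand-in worker rather than setCombination:

from multiprocessing import Process, Queue

def worker(q, chunk):
    # Stand-in for setCombination: each process puts one (possibly large) result.
    q.put([value * 2 for value in chunk])

if __name__ == '__main__':
    q = Queue()
    processes = [Process(target=worker, args=(q, list(range(50000)))) for _ in range(10)]
    for p in processes:
        p.start()
    results = [q.get() for _ in processes]  # drain the queue first...
    for p in processes:
        p.join()                            # ...then joining cannot block on a full pipe
    print(len(results), 'results collected')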

multiprocessing does not save data

My program essentially scrapes images off of websites (the URLs below are made up). I have 3 functions, and each of them scrapes images off of a specific website using a parameter. My program contains the following code.
import requests
from bs4 import BeautifulSoup
from multiprocessing import Process

img1 = []
img2 = []
img3 = []

def my_func1(img_search):
    del img1[:]
    url1 = "http://www.somewebsite.com/" + str(img_search)
    r1 = requests.get(url1)
    soup1 = BeautifulSoup(r1.content)
    data1 = soup1.find_all("div", {"class": "img"})
    for item in data1:
        try:
            img1.append(item.contents[0].find('img')['src'])
        except:
            img1.append("img Unknown")
    return

def my_func2(img_search):
    del img2[:]
    url2 = "http://www.somewebsite2.com/" + str(img_search)
    r2 = requests.get(url2)
    soup2 = BeautifulSoup(r2.content)
    data2 = soup2.find_all("div", {"class": "img"})
    for item in data2:
        try:
            img2.append(item.contents[0].find('img')['src'])
        except:
            img2.append("img Unknown")
    return

def my_func3(img_search):
    del img3[:]
    url3 = "http://www.somewebsite3.com/" + str(img_search)
    r3 = requests.get(url3)
    soup3 = BeautifulSoup(r3.content)
    data3 = soup3.find_all("div", {"class": "img"})
    for item in data3:
        try:
            img3.append(item.contents[0].find('img')['src'])
        except:
            img3.append("img Unknown")
    return

my_func1("orange cat")
my_func2("blue cat")
my_func3("green cat")

print(*img1, sep='\n')
print(*img2, sep='\n')
print(*img3, sep='\n')
The scraping works just fine, but it is quite slow, so I decided to use multiprocessing to speed it up, and multiprocessing did in fact speed it up. I essentially replaced the function calls with this:
p = Process(target=my_func1, args=("orange cat",))
p.start()
p2 = Process(target=my_func2, args=("blue cat",))
p2.start()
p3 = Process(target=my_func3, args=("green cat",))
p3.start()
p.join()
p2.join()
p3.join()
However, when I print the img1, img2, and img3 lists, they are empty. How would I fix this?
When you use multiprocessing to distribute your work between several processes, each process will run in a separate namespace (a copy of the namespace of the main process). The changes you make in the child-process's namespace will not be reflected in the parent process's namespace. You'll need to use a multiprocessing.Queue or some other synchronization method to pass the data back from the worker processes.
In your example code, your three functions are almost exactly the same, only the web site's domain and the variable names differ. If that's how your real functions look, I suggest using multiprocessing.Pool.map and passing the whole URL to a single function, rather than just passing search terms:
import multiprocessing

import requests
from bs4 import BeautifulSoup

def my_func(search_url):
    r = requests.get(search_url)
    soup = BeautifulSoup(r.content)
    data = soup.find_all("div", {"class": "img"})
    images = []
    for item in data:
        try:
            images.append(item.contents[0].find('img')['src'])
        except:
            images.append("img Unknown")
    return images

if __name__ == "__main__":
    searches = ['http://www.somewebsite1.com/?orange+cat',  # or whatever
                'http://www.somewebsite2.com/?blue+cat',
                'http://www.somewebsite3.com/?green+cat']

    pool = multiprocessing.Pool()  # will create as many processes as you have CPU cores
    results = pool.map(my_func, searches)
    pool.close()
    # do something with results, which will be a list of the function return values
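Because Pool.map returns results in the same order as its input, results[0], results[1] and results[2] line up with the orange, blue and green cat searches respectively, so they can simply be unpacked (img1, img2, img3 = results) in place of the original module-level lists.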
