Python multiprocessing in for loop (requests and BeautifulSoup)

Python multiprocessing in for loop (requests and BeautifulSoup) - python

I have a list of many links and I want to use multiprocessing to speed up the process. Here is a simplified version; I need the output to be ordered like this:
I tried a lot of things (Process, Pool, etc.) but I always got errors. I need to do it with 4 or 8 threads and keep the output ordered like this. Thank you for any help. Here is the code:
from bs4 import BeautifulSoup
import requests
import time
# Match-detail pages to scrape, one URL per line with a trailing comma.
# The original one-line literal was missing the comma between the third and
# fourth entries, so implicit string-literal concatenation silently merged
# them into a single bogus URL (five items instead of six).
links = [
    "http://www.tennisexplorer.com/match-detail/?id=1672704",
    "http://www.tennisexplorer.com/match-detail/?id=1699387",
    "http://www.tennisexplorer.com/match-detail/?id=1698990",
    "http://www.tennisexplorer.com/match-detail/?id=1696623",
    "http://www.tennisexplorer.com/match-detail/?id=1688719",
    "http://www.tennisexplorer.com/match-detail/?id=1686305",
]
# Result strings, appended in the same order as `links`.
data = []
def essa(match, omega):
    """Scrape one match-detail page and describe its two players.

    Args:
        match: URL of the match-detail page to download.
        omega: Label appended to the result (the caller passes the match's
            position in the list).

    Returns:
        The string "<player 1 href> - <player 2 href> - <omega>".
    """
    page = BeautifulSoup(requests.get(match).text, "lxml")
    player_cells = page.find("div", id="center").find_all("th", class_="plName")
    # The first two "plName" header cells carry the links to the player pages.
    hrefs = [player_cells[n].find("a").get("href") for n in (0, 1)]
    return hrefs[0] + " - " + hrefs[1] + " - " + str(omega)
# Sequential baseline: fetch every link in order and time the whole run.
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# monotonic, high-resolution replacement for measuring elapsed time.
start_time = time.perf_counter()
# Matches are numbered from 1, in the same order as `links`.
for i, link in enumerate(links, start=1):
    data.append(essa(link, i))
for d in data:
    print(d)
print(time.perf_counter() - start_time, "seconds")

Spawn several threads of the function and join them together:
from threading import Thread
def essa(match, omega):
    """Scrape one match-detail page and print its two player links.

    Intended as a thread target: the result is printed rather than returned,
    so lines appear in completion order, not in submission order.

    Args:
        match: URL of the match-detail page to download.
        omega: Label printed after the two player links.
    """
    aaa = BeautifulSoup(requests.get(match).text, "lxml")
    center = aaa.find("div", id="center")
    players = center.find_all("th", class_="plName")
    p1_l = players[0].find("a").get("href")
    p2_l = players[1].find("a").get("href")
    # print() with one argument behaves the same on Python 2 and 3; the
    # original `print x` statement is a SyntaxError on Python 3.
    print(p1_l + " - " + p2_l + " - " + str(omega))
if __name__ == '__main__':
    # One thread per link. Labels start at 1 so they match the numbering used
    # by the sequential version in the question (the original started at 0).
    threadlist = [
        Thread(target=essa, args=(url, index))
        for index, url in enumerate(links, start=1)
    ]
    for t in threadlist:
        t.start()
    # Wait for every download to finish before the script exits.
    for b in threadlist:
        b.join()
You won't get them to print in order, for the simple reason that some HTTP responses take longer than others.

As far as I can understand, you have a list of links and want to make the requests concurrently to speed up the process. Here is some sample code for multithreading; I hope it helps. Read the documentation for concurrent.futures.
import concurrent.futures
import urllib.request
# Pages fetched by the example. The last entry looks intentionally bogus,
# presumably to exercise the error-handling path.
URLS = [
    "http://www.foxnews.com/",
    "http://www.cnn.com/",
    "http://europe.wsj.com/",
    "http://www.bbc.co.uk/",
    "http://some-made-up-domain.com/",
]
def load_url(url, timeout):
    """Retrieve a single page and return its raw contents.

    Args:
        url: Address to open.
        timeout: Timeout in seconds handed to urllib.request.urlopen.

    Returns:
        The response body as bytes.
    """
    response = urllib.request.urlopen(url, timeout=timeout)
    # The context manager closes the connection once the body has been read.
    with response:
        return response.read()
# The with statement shuts the pool down (and joins its threads) on exit.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit every download and remember which URL each future belongs to.
    future_to_url = {}
    for url in URLS:
        future_to_url[executor.submit(load_url, url, 60)] = url
    # Handle results as they finish, i.e. in completion order.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))

Related

How do i make concurrent request to an APi endpoint in Python? Receiving error: expecting value: line 1 column 1 (char 0)

I have a python script that is sending concurrent requests to an API endpoint. When using the rick and morty API endpoint it works fine. But when I try to send it to my local Node server, it gives me the following error.
Looks like something went wrong: Expecting value: line 1 column 1 (char 0)
Does anyone know why? Code is below
import requests
import concurrent
from concurrent.futures import ThreadPoolExecutor
import time
# Character ids to request (1 through 9).
characters = range(1, 10)
# Public API that works, and the local server that triggers the error.
base_url = "https://rickandmortyapi.com/api/character"
test_url = "http://localhost:8080"
# Maximum number of worker threads in the pool.
threads = 20
def get_character_info(character):
    """Fetch one character record from the server and return the decoded JSON.

    Args:
        character: Identifier appended to ``test_url`` as the path.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        ValueError: A successful response did not contain valid JSON.
    """
    headers = {"Authorization": "Bearer MYREALLYLONGTOKENIGOT"}
    # The original built `headers` but never passed them to the request, so
    # the Authorization token was not sent.
    r = requests.get(f'{test_url}/{character}', headers=headers)
    # Surface HTTP errors directly instead of letting an error page fail
    # later as "Expecting value: line 1 column 1 (char 0)" inside r.json().
    r.raise_for_status()
    return r.json()
with ThreadPoolExecutor(max_workers=threads) as executor:
    start_time = time.time()
    # One future per character id; despite its name this is a plain set.
    future_to_url = set()
    for char in characters:
        future_to_url.add(executor.submit(get_character_info, char))
    # Report each result (or failure) as soon as its request completes.
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            data = future.result()
            print(data)
        except Exception as e:
            print('Looks like something went wrong:', e)
print("--- %s seconds ---" % (time.time() - start_time))
# 0.9544878005981445

Python requests.get and threading with different servers

I am working on a simple web scraper and am currently trying to add multithreading. My code works as intended with some servers (vastly reducing execution time), but my primary goal is to make it work with a few specific ones. When I try it with the servers in the sites list, I get the same performance as with sequential code. Any guesses as to what could cause this?
import requests, time
from bs4 import BeautifulSoup
from threading import Thread
from random import choice
# Enable to get some logging info
#---------------------------------
# import logging
# import http.client
# http.client.HTTPConnection.debuglevel = 1
# logging.basicConfig()
# logging.getLogger().setLevel(logging.DEBUG)
# requests_log = logging.getLogger("requests.packages.urllib3")
# requests_log.setLevel(logging.DEBUG)
# requests_log.propagate = True
# Base URLs whose numbered pages ("?page=N") are downloaded.
sites = [
    "https://pikabu.ru/community/blackhumour",
    "https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot",
]
class Pikabu_Downloader(Thread):
    """Thread that downloads one page and parses it with BeautifulSoup.

    Progress is printed with the time elapsed since the object was created.
    After the thread has finished, ``soup`` holds the parsed document, or
    None when the download failed.
    """

    def __init__(self, url, name, *args, **kwargs):
        """Store the target URL and a label used in the progress output.

        Args:
            url: Page to download.
            name: Label for this thread (also becomes the Thread name).
            *args, **kwargs: Forwarded unchanged to ``Thread.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.url = url
        self.name = name
        self.begin = time.time()
        # Defined up front so callers can read it even when the download
        # fails; the original only created the attribute on success.
        self.soup = None

    def _elapsed(self):
        """Return the seconds since construction, rounded to 4 decimals."""
        return round(time.time() - self.begin, 4)

    def run(self):
        """Download the page, then parse it if the download succeeded."""
        print("Beginning with thread number", self.name, ",", self._elapsed(), " seconds has passed")
        html_data = self._get_html()
        print("After requests.get with thread number", self.name, ",", self._elapsed(), " seconds has passed")
        if html_data is None:
            return
        self.soup = BeautifulSoup(html_data, "html.parser")
        print("After making soup with thread number", self.name, ",", self._elapsed(), " seconds has passed")

    def _get_html(self):
        """Return the page body as text, or None if the request failed."""
        # One complete browser User-Agent. The original tuple split this
        # string into four items, so random.choice() sent a fragment such as
        # 'Safari/537.36' as the whole header.
        user_agents = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        )
        print(f"Go {self.url}...")
        try:
            res = requests.get(self.url, headers={'User-Agent': choice(user_agents)}, stream=True)  # , allow_redirects=False)
            # With stream=True the body is only downloaded when .text is
            # read, so that read belongs inside the try as well.
            return res.text
        except requests.exceptions.RequestException as exc:
            print(exc)
            return None
# Alternative server used for comparison (see the note inside download()).
test = "https://readingbooks.site/read/?name=1984&"


def download():
    """Fetch pages 1-9 of every site, one thread per page, and wait for all."""
    pikabu_urls = [url + "?page=" + str(x) for url in sites for x in range(1, 10)]
    pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(pikabu_urls)]
    # To get results from the test server instead, comment out the line above
    # and enable the two lines below (`pages` must be defined first):
    # tests = [test + "page=" + str(x) for x in range(1, pages)]
    # pikabu_dls = [Pikabu_Downloader(url=page, name=str(i)) for i, page in enumerate(tests)]
    for pikabu_dl in pikabu_dls:
        pikabu_dl.start()
    for pikabu_dl in pikabu_dls:
        pikabu_dl.join()


download()
And the result is something like
...
After requests.get with thread number 1 , 1.6904 seconds has passed
After making soup with thread number 1 , 1.7554 seconds has passed
After requests.get with thread number 2 , 2.9805 seconds has passed
After making soup with thread number 2 , 3.0455 seconds has passed
After requests.get with thread number 3 , 4.3225 seconds has passed
After making soup with thread number 3 , 4.3895 seconds has passed
...
What can cause such latency between thread executions? I was hoping that the threads would finish almost simultaneously and produce more asynchronous output, as they do with the test server. If I set a timeout of 5 seconds inside requests.get, most of the requests won't even complete.

After investigating your case, I would point out some issues that you have run into:
Do not print inside parallel tasks; rendering to the screen becomes a bottleneck.
A large number of tasks is not always good for performance; it depends on how much your memory can handle. Imagine that you have 1000 links: do you have to create 1000 task objects? No, you only need placeholders for 5-20 of them by leveraging a ThreadPool.
The server is also a factor when it handles requests. Download size, low bandwidth, network conditions, distance, etc. all delay the response and affect what you observe on your machine. Your sites are heavy; each request seems to take 1-3000 ms, so when you test with a small sample (20 links) it feels as if the code runs sequentially.
Your code does run in parallel, since you use a small trick to put each request on a different thread, but that is not quite the right approach: what we need is a fully asynchronous library, such as asyncio with aiohttp. aiohttp takes care of numerous asynchronous requests in coroutines, whereas asyncio provides the syntax and runs on your main thread.
I did a small experiment on Colab. Please note that I didn't use asyncio and aiohttp on Colab because it got stuck, but I have used them on several projects before, and they were faster than the fastest method below.
The second function is your implementation.
import urllib.request
from threading import Thread
import time, requests
from random import choice
# One complete browser User-Agent. The original tuple split this string into
# four items, so random.choice() sent a fragment such as 'Safari/537.36' as
# the whole header. Kept as a tuple so more agents can be added.
user_agents = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
)
# Timeout value (seconds) handed to load_url by every benchmark below.
timeout = 5
sites = [
    "https://pikabu.ru/community/blackhumour",
    "https://www.pikabu.ru/tag/%D0%9C%D0%B5%D0%BC%D1%8B/hot",
]
# 25 numbered pages (?page=0 .. ?page=24) for each site, grouped by site.
URLS = [url + "?page=" + str(x) for url in sites for x in range(25)]
def convert_to_threads():
    """Build, without starting, one Thread per entry in URLS.

    Returns:
        A list of threads, each set up to call load_url(page, timeout).
    """
    workers = []
    for page in URLS:
        workers.append(Thread(target=load_url, args=(page, timeout)))
    return workers
def running_threads():
    """Run every download on its own thread and print the elapsed time."""
    workers = convert_to_threads()
    # Thread construction is deliberately excluded from the measurement.
    began = time.time()
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print(f'Finish with {len(URLS)} requests {time.time() - began}')
def load_url(url, timeout):
    """Download url and return the decoded response body.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to requests.get as its timeout.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: The request failed or timed
            out.
    """
    # The original accepted `timeout` but never forwarded it, so a stalled
    # server could block the calling thread indefinitely.
    res = requests.get(
        url,
        headers={'User-Agent': choice(user_agents)},
        stream=True,
        timeout=timeout,
    )  # , allow_redirects=False)
    return res.text
def running_sequence():
    """Download every URL one after another and print the elapsed time."""
    began = time.time()
    for page in URLS:
        load_url(page, timeout)
    print(f'Finish with {len(URLS)} requests {time.time() - began}')
def running_thread_pool():
    """Download every URL through a 15-worker pool and print the elapsed time.

    Failed downloads are reported per URL and do not stop the run.
    """
    # This snippet never imports concurrent.futures at module level, so it is
    # imported here to keep the function self-contained.
    import concurrent.futures

    start = time.time()
    # The with statement ensures the pool's threads are cleaned up promptly.
    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
        # Start the load operations and mark each future with its URL.
        future_to_url = {executor.submit(load_url, url, timeout): url for url in URLS}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                # Only the success or failure matters; the body is discarded.
                future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
    print(f'Finish with {len(URLS)} requests {time.time() - start}')
In short, I recommend using a thread pool (preferable on Colab), or asyncio and aiohttp (outside Colab), to gain speed.

How can I improve the aiohttp crawler speed?

import asyncio
import time

import aiohttp
from bs4 import BeautifulSoup
from xlrd import open_workbook
from xlwt import Workbook
# The original literals were not valid Python (unquoted URLs, stray dots).
# Only the first entries are shown; the real list holds more than 20000
# different websites, and some of them may not be accessible.
url_list = [
    "https://www.facebook.com",
    "https://www.baidu.com",
    "https://www.yahoo.com",
    # ...
]
# Placeholder keywords; a page is kept when its title contains any of them.
keywords = ['xxx', 'xxx']
start = time.time()
localtime = time.asctime(time.localtime(time.time()))
print("start time :", localtime)
# Parallel result lists: matching URLs and their page titles.
choose_url = []
url_title = []
async def get(url, session):
    """Download url and record it when its <title> contains any keyword.

    On a match the URL and the title are appended to the module-level
    ``choose_url`` and ``url_title`` lists. Any failure is reported and the
    URL is skipped, so a single bad site cannot abort the whole crawl.

    Args:
        url: Address to fetch.
        session: Object whose ``get(url=..., timeout=...)`` is used as an
            async context manager (the caller passes an
            aiohttp.ClientSession).
    """
    try:
        # NOTE(review): timeout=0 is presumably meant to disable the timeout;
        # confirm against the installed aiohttp version.
        async with session.get(url=url, timeout=0) as response:
            resp = await response.text()
            title_tag = BeautifulSoup(resp, "lxml").find("title")
            if title_tag is None:
                # No <title>: nothing to match. The original hit an
                # AttributeError here that its bare `pass` silently hid.
                return
            title = title_tag.text.strip()
            if any(keyword in title for keyword in keywords):
                choose_url.append(url)
                url_title.append(title)
                print("Successfully got url {} with resp's name {}.".format(url, title))
    except Exception as e:
        # Still best-effort, but no longer silent: report what was skipped.
        print("Skipping url {}: {!r}".format(url, e))
async def main(urls):
    """Fetch every URL concurrently over one shared aiohttp session.

    Args:
        urls: Iterable of addresses; each is handled by ``get``.
    """
    connector = aiohttp.TCPConnector(ssl=False, limit=0, limit_per_host=0)
    # `async with` closes the session even when gather() raises or the task
    # is cancelled; the original explicit close() was skipped in that case.
    async with aiohttp.ClientSession(connector=connector) as session:
        await asyncio.gather(*[get(url, session) for url in urls])
        print("Finalized all. Return is a list of outputs.")
def write_exccel(choose_url, url_title):
    """Placeholder: write choose_url and url_title to an Excel file."""
    # Not implemented in this listing.
    return None
# Run the crawl to completion, save the matches, then report the timing.
asyncio.run(main(url_list))
write_exccel(choose_url, url_title)
localtime = time.asctime(time.localtime())
print("now time is :", localtime)
end = time.time()
print('time used：', end - start)
I have 20000 URLs to request, but it takes a long time (more than 4 or 5 hours). It takes only 3 hours if I use requests + multiprocessing (a Pool of 4).
I tried to use aiohttp + multiprocessing, but it doesn't seem to work. Can the code be made as fast as possible, either by optimizing it or by using any other available technology? Thanks.

I don't know if the following method is fast or not.
import time
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
class MySpider(Spider):
    """Spider that records pages whose title contains one of the keywords."""

    name = 'demo_spider'
    start_urls = ["https://www.facebook.com", "https://www.baidu.com", "https://www.yahoo.com"]  # Entry page
    keywords = ['xxx', 'xxx']
    # Parallel result lists, shared at class level: matching URLs and titles.
    choose_url = []
    url_title = []
    concurrencyPer1s = 10

    def extract(self, url, html, models, modelNames):
        """Record the page's URL and title when the title matches a keyword."""
        title = SimplifiedDoc(html).title
        if not title.containsOr(self.keywords):
            return
        self.choose_url.append(url.url)
        self.url_title.append(title.text)
        print("Successfully got url {} with resp's name {}.".format(url, title.text))

    def urlCount(self):
        """Return the base class's URL count, clearing the run flag at zero."""
        remaining = Spider.urlCount(self)
        if remaining == 0:
            # Presumably this stops the framework once nothing is left to
            # crawl; verify against the simplified_scrapy documentation.
            SimplifiedMain.setRunFlag(False)
        return remaining
start = time.time()
localtime = time.asctime(time.localtime())
print("start time :", localtime)
# Settings handed to the framework together with the spider instance.
spider_options = {"concurrency": 600, "concurrencyPer1S": 100, "intervalTime": 0.001, "max_workers": 10}
SimplifiedMain.startThread(MySpider(), spider_options)  # Start download
localtime = time.asctime(time.localtime())
print("now time is :", localtime)
end = time.time()
print('time used：', end - start)

Python - Multithreaded Proxy Tester

I'm building a proxy checker using multithreads, specificly a thread pool from:
from multiprocessing.dummy import Pool as ThreadPool.
The http request is by using urllib2.
What I want to do is run 20 requests for each proxy. With a single thread this would take too much time, and that is where multithreading comes in. Once I have set up a proxy, I want to run those 20 requests and keep track of two things: first, count the exceptions and drop the proxy if too many occur; second, record the average response time so it can be presented later.
I have not managed to implement the above with threads, but I have implemented it with a single thread:
import socket
import ssl
import time
import urllib
import urllib2
import httplib
# Proxies to test; filled in by loadProxysFromFile().
proxyList = []


def loadProxysFromFile(fileName):
    """Replace the module-level proxyList with the lines of fileName.

    Trailing newline characters are removed from each line.
    """
    global proxyList
    loaded = []
    with open(fileName) as f:
        for line in f:
            loaded.append(line.rstrip('\n'))
    proxyList = loaded
def setUrllib2Proxy(proxyAddress):
    """Install a urllib2 opener that sends http and https via proxyAddress.

    install_opener() replaces urllib2's default opener, so the setting
    applies to every later urllib2.urlopen() call, not only to this caller.
    """
    handler = urllib2.ProxyHandler({
        'http': "http://" + proxyAddress,
        'https': "https://" + proxyAddress,
    })
    urllib2.install_opener(urllib2.build_opener(handler))
def timingRequest(proxy, url):
    """Time one request to url through proxy.

    Returns:
        The elapsed time in seconds, or 0 when the request failed.
    """
    setUrllib2Proxy(proxy)
    start = time.time()
    try:
        # Opening the request is enough; the response itself is not read.
        urllib2.urlopen(urllib2.Request(url), timeout=5)
        failed = False
    except (urllib2.URLError, httplib.BadStatusLine, ssl.SSLError, socket.error):
        failed = True
    timing = time.time() - start
    if failed:
        print("Error with proxy " + proxy)
        return 0
    print(proxy + " Request to " + url + " took: %s" % timing + " seconds.")
    return timing
# Main
# NOTE(review): indentation was lost in this listing, so the nesting of the
# print lines below is ambiguous; the code is left exactly as found.
# Load the proxies from the file and echo each one before testing.
loadProxysFromFile("proxyList.txt")
for proxy in proxyList:
print "Testing: " + proxy
print "\n"
# Number of timed requests attempted per proxy.
REQUEST_NUM = 20
# A proxy is given up on once it accumulates this many failed requests.
ERROR_TOLERANCE_NUM = 3
# One summary line per proxy that stayed below the error tolerance.
resultList = []
for proxy in proxyList:
# avgTime first accumulates the total time, then is divided into an average.
avgTime = 0
errorCount = 0
for x in range(0, REQUEST_NUM):
result = timingRequest(proxy, 'https://www.google.com')
# timingRequest() returns 0 to signal a failed request.
if (result == 0):
errorCount += 1
if (errorCount >= ERROR_TOLERANCE_NUM):
break
else:
avgTime += result
# Average over the successful requests only; failures are excluded.
if (errorCount < ERROR_TOLERANCE_NUM):
avgTime = avgTime/(REQUEST_NUM-errorCount)
resultList.append(proxy + " has an average response time of: %s" %avgTime)
print '\n'
print "Results Summery: "
print "-----------------"
for res in resultList:
print res
The things that must be done are:
For every proxy: wait until all 20 requests are finished before moving on to the next proxy, and synchronize the threads somehow when they add up their timings to calculate the average response time (which must not take the exceptions into account).
The best solution I've read so far is to use from multiprocessing.dummy import Pool as ThreadPool and pool.map(func, iterable), but I can't figure out how to implement it in my code.

Multi-threaded Python Web Crawler Got Stuck

I'm writing a Python web crawler and I want to make it multi-threaded. Now I have finished the basic part, below is what it does:
a thread gets a url from the queue;
the thread extracts the links from the page, checks if the links exist in a pool (a set), and puts the new links to the queue and the pool;
the thread writes the url and the http response to a csv file.
But when I run the crawler, it always gets stuck eventually, not exiting properly. I have gone through the official document of Python but still have no clue.
Below is the code:
#!/usr/bin/env python
#!coding=utf-8
import requests, re, urlparse
import threading
from Queue import Queue
from bs4 import BeautifulSoup
#custom modules and files
from setting import config
class Page:
    """One fetched URL: its HTTP status and, on success, the response.

    Attributes set by __init__:
        url: The address that was requested.
        status: The status code, the first redirect response when the
            request was redirected, or the exception when the request failed.
        rawdata: The requests response object ("" when the request failed).
        error: True when the request raised a RequestException.
    """

    def __init__(self, url):
        self.url = url
        self.status = ""
        self.rawdata = ""
        self.error = False
        try:
            r = requests.get(self.url, headers={'User-Agent': 'random spider'})
        except requests.exceptions.RequestException as e:
            self.status = e
            self.error = True
            return
        # For a redirected request, report the first hop, not the final page.
        self.status = r.history[0] if r.history else r.status_code
        self.rawdata = r

    def outlinks(self):
        """Collect the page's distinct links into ``self.outlinks``.

        NOTE: assigning ``self.outlinks`` replaces this bound method on the
        instance with the result list; callers invoke ``p.outlinks()`` once
        and then read ``p.outlinks``.
        """
        self.outlinks = []
        # Each link is a dict holding the URL, anchor text and nofollow flag.
        soup = BeautifulSoup(self.rawdata.text.lower())
        for link in soup.find_all('a', href=True):
            d = {
                "follow": "yes",
                'url': urlparse.urljoin(self.url, link.get('href')),
                'anchortext': link.text,
            }
            rel = link.get('rel')
            if rel and "nofollow" in rel:
                d["follow"] = "no"
            if d not in self.outlinks:
                self.outlinks.append(d)
# Shared crawler state.
thread_num = 10
# Guards writes to `output` and updates of `pool` / `exist` in crawl().
lock = threading.Lock()
# URLs that have already been queued, and the queue of URLs still to fetch.
exist = set()
pool = Queue()
output = open("final.csv", "a")
#the domain is the start point
domain = config["domain"]
exist.add(domain)
pool.put(domain)
def crawl():
    """Worker loop: fetch queued URLs, log them, and queue new same-site links.

    Runs forever; the main thread detects completion through ``pool.join()``.
    Every ``pool.get()`` is therefore matched by exactly one
    ``pool.task_done()``, even when processing a page fails. In the original,
    an exception killed the worker before ``task_done()`` was reached, which
    left ``pool.join()`` waiting forever.
    """
    while True:
        url = pool.get()
        try:
            p = Page(url)
            # Write data to the output file. `with lock` releases the lock
            # even if the write raises.
            with lock:
                output.write(p.url+" "+str(p.status)+"\n")
                print("%s crawls %s" % (threading.currentThread().getName(), p.url))
            if not p.error:
                p.outlinks()
                # Only follow links found on pages of the start domain.
                if urlparse.urlparse(p.url)[1] == urlparse.urlparse(domain)[1]:
                    for link in p.outlinks:
                        # Check and add under the same lock so two workers
                        # cannot both queue the same URL.
                        with lock:
                            if link['url'] not in exist:
                                exist.add(link['url'])
                                pool.put(link['url'])
        except Exception as e:
            # Keep the worker alive; one bad page must not stop the crawl.
            print("%s failed on %s: %r" % (threading.currentThread().getName(), url, e))
        finally:
            pool.task_done()
for i in range(thread_num):
    t = threading.Thread(target=crawl)
    # The workers loop forever in pool.get(). As daemon threads they no
    # longer keep the interpreter alive once the main thread is done; with
    # non-daemon threads the program never exited.
    t.daemon = True
    t.start()
# Blocks until every queued URL has been marked done via task_done().
pool.join()
output.close()
Any help would be appreciated!
Thanks
Marcus

Your crawl function has an infinite while loop with no possible exit path.
The condition True always evaluates to True, so the loop continues and, as you say,
the program does not exit properly.
Modify the crawl function's while loop to include a condition. For instance, exit the while loop once the number of links saved to the csv file exceeds a certain minimum number.
For example:
def crawl():
    # Keep working only while the number of known links is within the limit;
    # `min_links` has to be defined by the caller.
    while len(exist) <= min_links:
        ...

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python multiprocessing in for loop (requests and BeautifulSoup) - python

Related

How do i make concurrent request to an APi endpoint in Python? Receiving error: expecting value: line 1 column 1 (char 0)

Python requests.get and threading with different servers

How can I improve the aiohttp crawler speed?

Python - Multithreaded Proxy Tester

Multi-threaded Python Web Crawler Got Stuck

Categories

Resources