Simple web crawler very slow - python

I have built a very simple web crawler to crawl ~100 small JSON files at the URL below. The issue is that the crawler takes more than an hour to complete. I find that hard to understand given how small the JSON files are. Am I doing something fundamentally wrong here?
import json
import requests
from lxml import html

def get_senate_vote(vote):
    URL = 'https://www.govtrack.us/data/congress/113/votes/2013/s%d/data.json' % vote
    response = requests.get(URL)
    json_data = json.loads(response.text)
    return json_data

def get_all_votes():
    all_senate_votes = []
    URL = "http://www.govtrack.us/data/congress/113/votes/2013"
    response = requests.get(URL)
    root = html.fromstring(response.content)
    for a in root.xpath('/html/body/pre/a'):
        link = a.xpath('text()')[0].strip()
        if link[0] == 's':
            vote = int(link[1:-1])
            try:
                vote_json = get_senate_vote(vote)
            except:
                return all_senate_votes
            all_senate_votes.append(vote_json)
    return all_senate_votes

vote_data = get_all_votes()

Here is a rather simple code sample; I've calculated the time taken for each call. On my system it takes on average 2 seconds per request, and there are 582 pages to visit, so around 19 minutes without printing the JSON to the console. In your case, network time plus print time may increase it.
#!/usr/bin/python
import requests
import re
import time

def find_votes():
    r = requests.get("https://www.govtrack.us/data/congress/113/votes/2013/")
    data = r.text
    votes = re.findall(r's\d+', data)
    return votes

def crawl_data(votes):
    print("Total pages: " + str(len(votes)))
    for x in votes:
        url = 'https://www.govtrack.us/data/congress/113/votes/2013/' + x + '/data.json'
        t1 = time.time()
        r = requests.get(url)
        json = r.json()
        print(time.time() - t1)

crawl_data(find_votes())

If you are using Python 3.x and you are crawling multiple sites, for even better performance I warmly recommend the aiohttp module, which implements asynchronous principles.
For example:
import aiohttp
import asyncio

sites = ['url_1', 'url_2']
results = []

def save_response(result):
    site_content = result.result()
    results.append(site_content)

async def crawl_site(site):
    async with aiohttp.ClientSession() as session:
        async with session.get(site) as resp:
            resp = await resp.text()
            return resp

tasks = []
for site in sites:
    task = asyncio.ensure_future(crawl_site(site))
    task.add_done_callback(save_response)
    tasks.append(task)

all_tasks = asyncio.gather(*tasks)
loop = asyncio.get_event_loop()
loop.run_until_complete(all_tasks)
loop.close()

print(results)
For more reading, see the aiohttp documentation.
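Applied to the govtrack files from the question, a minimal sketch might look like the following (the vote list here is a placeholder; in practice it would come from parsing the index page, as in the question):

import asyncio
import aiohttp

votes = ['s1', 's2', 's3']   # placeholder vote ids
BASE = 'https://www.govtrack.us/data/congress/113/votes/2013/{}/data.json'

async def fetch_vote(session, vote):
    async with session.get(BASE.format(vote)) as resp:
        return await resp.json()

async def fetch_all(votes):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_vote(session, v) for v in votes))

vote_data = asyncio.get_event_loop().run_until_complete(fetch_all(votes))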

Related

Web scraping thousands of links using Python concurrent.futures

I am trying to scrape data from thousands of links which have the same content and the same extraction procedure. To speed up the process I am using Python's concurrent.futures, which I think is the best in terms of speed. When I scrape data from about 30-40 links as a trial, it works; but as the number increases it does not. Here is my code:
import re
import json
import requests
import concurrent.futures
import time

links_json = ['https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/485387/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/485256/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/487113/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/486733/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/486937/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/486946/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/485444/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/487258/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/487011/',
              'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/487254/']

MAX_THREADS = 30
Data_Source = "RASFF"
Product_Category = []
Date = []
Product_name = []
Reference = []

def scrape(links):
    data = requests.get(links).json()
    Product_Category.append(data["product"]["productCategory"]["description"])
    Date.append(data["ecValidationDate"])
    Product_name.append(data["product"]["description"])
    Reference.append(data["reference"])

def download_data(links_json):
    threads = min(MAX_THREADS, len(links_json))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(scrape, links_json)

def main(new_links):
    t0 = time.time()
    download_data(new_links)
    t1 = time.time()
    print(f"{t1-t0} seconds to crawl {len(new_links)} in total.")

main(links_json)
When I try to run the main function, it is very inconsistent. Right now there are only 12 links to scrape, but as the number of links increases, the amount of data extracted into the lists decreases. For instance: if there are about 200 links, there should be 200 values in the Product_Category list, but there are sometimes 100, 67, etc., meaning it is very inconsistent. I am not sure if I am missing something. I have even tried adding time.sleep(0.25) in the scrape function but it does not work. I don't know how I can provide a list of 500-1000 links here.
Here's an example of how one could do this using the threading module:-
import requests
import threading

Product_Category = []
Date = []
Product_name = []
Reference = []

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
BASEURL = 'https://webgate.ec.europa.eu/rasff-window/backend/public/notification/view/id/'
LOCK = threading.Lock()
headers = {'User-Agent': AGENT}

links = ['485387',
         '485256',
         '487113',
         '486733',
         '486937',
         '486946',
         '485444',
         '487258',
         '487011',
         '487254']

def scrape(session, link):
    response = session.get(f'{BASEURL}{link}/', headers=headers)
    response.raise_for_status()
    json = response.json()
    try:
        LOCK.acquire()
        Product_Category.append(
            json["product"]["productCategory"]["description"])
        Date.append(json["ecValidationDate"])
        Product_name.append(json["product"]["description"])
        Reference.append(json["reference"])
    finally:
        LOCK.release()

def main():
    with requests.Session() as session:
        ta = []
        for link in links:
            t = threading.Thread(target=scrape, args=(session, link))
            ta.append(t)
            t.start()
        for t in ta:
            t.join()
        print(Product_Category)
        print(Date)
        print(Product_name)
        print(Reference)

if __name__ == '__main__':
    main()
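For completeness, an alternative that stays closer to the original concurrent.futures code: executor.map only re-raises exceptions from scrape when its results are consumed, so failed requests silently drop their values, and appending to four shared lists from many threads makes partial failures hard to spot. A minimal sketch (assuming the same RASFF URLs and JSON fields as the question) that returns one row per link and collects them from map:

import concurrent.futures
import requests

MAX_THREADS = 30

def scrape(link):
    # Any exception raised here is re-raised when the map() result is consumed,
    # instead of disappearing silently.
    data = requests.get(link, timeout=30).json()
    return (data["product"]["productCategory"]["description"],
            data["ecValidationDate"],
            data["product"]["description"],
            data["reference"])

def download_data(links):
    threads = min(MAX_THREADS, len(links))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        rows = list(executor.map(scrape, links))  # consuming the iterator surfaces errors
    # Split the rows back into the four parallel lists
    Product_Category, Date, Product_name, Reference = map(list, zip(*rows))
    return Product_Category, Date, Product_name, Reference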

Can't Stop ThreadPoolExecutor

I'm scraping hundreds of URLs, each with a leaderboard of data I want, and the only difference between each URL string is a 'platform', 'region', and lastly, the page number. There are only a few platforms and regions, but the page numbers change each day and I don't know how many there are. So that's the first function; I'm just creating lists of URLs to be requested in parallel.
If I use page=1, then the result will contain table_rows > 0 in the last function. But around page=500, the requested URL still pings back, just very slowly, and then it shows an error message, no leaderboard found, and the last function shows table_rows == 0, etc. The problem is I need to get through to the very last page, and I want to do this quickly, hence the ThreadPoolExecutor, but I can't cancel all the threads or processes or whatever once PAGE_LIMIT is tripped. I threw in the executor.shutdown(cancel_futures=True) just to show what I'm looking for. If nobody can help me I'll miserably remove the parallelization and scrape slowly, sadly, one URL at a time...
Thanks
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import pandas
import requests
import time

PLATFORM = ['xbl', 'psn', 'atvi', 'battlenet']
REGION = ['us', 'ca']
PAGE_LIMIT = True
INTERNET = True

def leaderboardLister():
    global REGION
    global PLATFORM
    list_url = []
    for region in REGION:
        for platform in PLATFORM:
            for i in range(1, 750):
                list_url.append('https://cod.tracker.gg/warzone/leaderboards/battle-royale/' + platform + '/KdRatio?country=' + region + '&page=' + str(i))
    leaderboardExecutor(list_url, 30)

def leaderboardExecutor(urls, threads):
    global PAGE_LIMIT
    global INTERNET
    if len(urls) > 0:
        with ThreadPoolExecutor(max_workers=threads) as executor:
            while True:
                if PAGE_LIMIT == False:
                    executor.shutdown(cancel_futures=True)
                while INTERNET == False:
                    try:
                        print('bad internet')
                        requests.get("http://google.com")
                        INTERNET = True
                    except:
                        time.sleep(3)
                        print('waited')
                executor.map(scrapeLeaderboardPage, urls)

def scrapeLeaderboardPage(url):
    global PAGE_LIMIT
    checkInternet()
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, features='lxml')
        table_rows = soup.find_all('tr')
        if len(table_rows) == 0:
            PAGE_LIMIT = False
            print(url)
        else:
            pass
        print('success')
    except:
        INTERNET = False

leaderboardLister()
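No answer is recorded here, but a common pattern for this kind of early stop (a sketch of my own, with made-up names like stop_flag and scrape_page, not the asker's code) is to submit each URL individually so you keep the Future objects, set a threading.Event when the empty page is detected, and cancel whatever has not started yet; executor.shutdown(cancel_futures=True) does something similar but needs Python 3.9+:

import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

stop_flag = threading.Event()   # set once an empty leaderboard page is seen

def scrape_page(url):
    if stop_flag.is_set():       # queued work that is no longer needed returns immediately
        return
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, features='lxml')
    if len(soup.find_all('tr')) == 0:
        stop_flag.set()

def run(urls, threads=30):
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(scrape_page, url) for url in urls]
        for future in futures:
            if stop_flag.is_set():
                future.cancel()   # only succeeds for futures that have not started yet
            else:
                future.result()   # re-raises any exception from the worker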

Index out of range when sending requests in a loop

I encounter an index out of range error when I try to get the number of contributors of a GitHub project in a loop. After some iterations (which work perfectly) it just throws that exception. I have no clue why...
import requests
from lxml import html

for x in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)  # prints the correct number until the exception
Here's the exception.
----> 4 contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
IndexError: list index out of range
It seems likely that you're getting a 429 - Too many requests, since you're firing requests one after the other.
You might want to modify your code as follows:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)
    time.sleep(3)  # Wait a bit before firing off another request
Better yet would be:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    if r.status_code in [200]:  # Check if the request was successful
        xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
        contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
        print(contributors_number)
    else:
        print("Failed fetching page, status code: " + str(r.status_code))
    time.sleep(3)  # Wait a bit before firing off another request
Now this works perfectly for me while using the API. Probably the cleanest way of doing it.
import requests
import json

url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100'
response = requests.get(url)
commits = json.loads(response.text)
commits_total = len(commits)
page_number = 1
while len(commits) == 100:
    page_number += 1
    url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100' + '&page=' + str(page_number)
    response = requests.get(url)
    commits = json.loads(response.text)
    commits_total += len(commits)
GitHub is blocking your repeated requests. Do not scrape sites in quick succession; many website operators actively block too many requests. As a result, the content that is returned no longer matches your XPath query.
You should be using the REST API that GitHub provides to retrieve project stats like the number of contributors, and you should implement some kind of rate limiting. There is no need to retrieve the same number 100 times; contributor counts do not change that rapidly.
API responses include information on how many requests you can make in a time window, and you can use conditional requests to only incur rate limit costs when the data actually has changed:
import requests
import time
from urllib.parse import parse_qsl, urlparse

owner, repo = 'tipsy', 'profile-summary-for-github'
github_username = '....'
# token = '....'  # optional Github basic auth token
stats = 'https://api.github.com/repos/{}/{}/contributors'

with requests.session() as sess:
    # GitHub requests you use your username or appname in the header
    sess.headers['User-Agent'] += ' - {}'.format(github_username)
    # Consider logging in! You'll get more quota
    # sess.auth = (github_username, token)

    # start with the first, move to the last when available, include anonymous
    last_page = stats.format(owner, repo) + '?per_page=100&page=1&anon=true'
    while True:
        r = sess.get(last_page)
        if r.status_code == requests.codes.not_found:
            print("No such repo")
            break
        if r.status_code == requests.codes.no_content:
            print("No contributors, repository is empty")
            break

        if r.status_code == requests.codes.accepted:
            print("Stats not yet ready, retrying")
        elif r.status_code == requests.codes.not_modified:
            print("Stats not changed")
        elif r.ok:
            # success! Check for a last page, get that instead of current
            # to get accurate count
            link_last = r.links.get('last', {}).get('url')
            if link_last and r.url != link_last:
                last_page = link_last
            else:
                # this is the last page, report on count
                params = dict(parse_qsl(urlparse(r.url).query))
                page_num = int(params.get('page', '1'))
                per_page = int(params.get('per_page', '100'))
                contributor_count = len(r.json()) + (per_page * (page_num - 1))
                print("Contributor count:", contributor_count)

        # only get us a fresh response next time
        sess.headers['If-None-Match'] = r.headers['ETag']

        # pace ourselves following the rate limit
        window_remaining = int(r.headers['X-RateLimit-Reset']) - time.time()
        rate_remaining = int(r.headers['X-RateLimit-Remaining'])
        # sleep long enough to honour the rate limit or at least 100 milliseconds
        time.sleep(max(window_remaining / rate_remaining, 0.1))
The above uses a requests session object to handle repeated headers and ensure that you get to reuse connections where possible.
A good library such as github3.py (incidentally written by a requests core contributor) will take care of most of those details for you.
If you do want to persist in scraping the site directly, you take a risk that the site operators block you altogether. Try to take some responsibility by not hammering the site continually.
That means that at the very least, you should honour the Retry-After header that GitHub gives you on 429:
if not r.ok:
    print("Received a response other than 200 OK:", r.status_code, r.reason)
    retry_after = r.headers.get('Retry-After')
    if retry_after is not None:
        print("Response included a Retry-After:", retry_after)
        time.sleep(int(retry_after))
else:
    # parse OK response

Can't modify my script to limit the number of requests while scraping

I've written a script in Python using Thread to handle multiple requests at the same time and make the scraping process faster. The script is doing its job accordingly.
In short, what the scraper does: it parses all the links from the landing page leading to each main page (where information is stored) and scrapes happy hours and featured specials from there. The scraper keeps going until all 29 pages are crawled.
As there may be numerous links to play with, I would like to limit the number of requests. However, as I don't have much of an idea about this, I can't find an ideal way to modify my existing script to serve the purpose.
Any help will be vastly appreciated.
This is my attempt so far:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading

url = "https://www.totalhappyhour.com/washington-dc-happy-hour/?page={}"

def get_info(link):
    for mlink in [link.format(page) for page in range(1, 30)]:
        response = requests.get(mlink)
        soup = BeautifulSoup(response.text, "lxml")
        itemlinks = [urljoin(link, container.select_one("h2.name a").get("href")) for container in soup.select(".profile")]
        threads = []
        for ilink in itemlinks:
            thread = threading.Thread(target=fetch_info, args=(ilink,))
            thread.start()
            threads += [thread]
        for thread in threads:
            thread.join()

def fetch_info(nlink):
    response = requests.get(nlink)
    soup = BeautifulSoup(response.text, "lxml")
    for container in soup.select(".specials"):
        try:
            hours = container.select_one("h3").text
        except Exception:
            hours = ""
        try:
            fspecial = ' '.join([item.text for item in container.select(".special")])
        except Exception:
            fspecial = ""
        print(f'{hours}---{fspecial}')

if __name__ == '__main__':
    get_info(url)
You should look at asyncio; it's simple and can help you do things faster!
Also, multiprocessing.Pool can simplify your code (in case you don't want to use asyncio).
multiprocessing.pool also has a ThreadPool equivalent if you prefer to use threads.
As for the request limit, I recommend using threading.Semaphore (or another semaphore type if you switch away from threading).
threading approach:
from multiprocessing.pool import ThreadPool as Pool
from threading import Semaphore
from time import sleep

MAX_RUN_AT_ONCE = 5
NUMBER_OF_THREADS = 10

sm = Semaphore(MAX_RUN_AT_ONCE)

def do_task(number):
    with sm:
        print(f"run with {number}")
        sleep(3)
        return number * 2

def main():
    p = Pool(NUMBER_OF_THREADS)
    results = p.map(do_task, range(10))
    print(results)

if __name__ == '__main__':
    main()
multiprocessing approach:
from multiprocessing import Pool
from multiprocessing import Semaphore
from time import sleep

MAX_RUN_AT_ONCE = 5
NUMBER_OF_PROCESS = 10

semaphore = None

def initializer(sm):
    """init the semaphore for the child process"""
    global semaphore
    semaphore = sm

def do_task(number):
    with semaphore:
        print(f"run with {number}\n")
        sleep(3)
        return number * 2

def main():
    sm = Semaphore(MAX_RUN_AT_ONCE)
    p = Pool(NUMBER_OF_PROCESS, initializer=initializer,
             initargs=[sm])
    results = p.map(do_task, range(10))
    print(results)

if __name__ == '__main__':
    main()
asyncio approach:
import asyncio

MAX_RUN_AT_ONCE = 5
sm = asyncio.Semaphore(MAX_RUN_AT_ONCE)

async def do_task(number):
    async with sm:
        print(f"run with {number}\n")
        await asyncio.sleep(3)
        return number * 2

async def main():
    coros = [do_task(number) for number in range(10)]
    finished, _ = await asyncio.wait(coros)
    print([fut.result() for fut in finished])

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
For conducting HTTP requests with asyncio you should use aiohttp. You can also use requests with loop.run_in_executor, but at that point there is little reason to use asyncio at all, because your code is still pretty much just requests.
output:
run with 0
run with 1
run with 2
run with 3
run with 4
(here there is a pause due to the semaphore and sleep)
run with 5
run with 6
run with 7
run with 8
run with 9
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
You can also check out concurrent.futures.ThreadPoolExecutor.
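To tie the two suggestions together, here is a minimal sketch of my own (not part of the answer above) that combines aiohttp with asyncio.Semaphore to cap the number of requests in flight; the URLs are placeholders:

import asyncio
import aiohttp

MAX_RUN_AT_ONCE = 5
urls = ['url_1', 'url_2']   # placeholder URLs

async def fetch(session, semaphore, url):
    async with semaphore:                 # at most MAX_RUN_AT_ONCE requests in flight
        async with session.get(url) as resp:
            return await resp.text()

async def main():
    semaphore = asyncio.Semaphore(MAX_RUN_AT_ONCE)
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, semaphore, url) for url in urls))
    print([len(page) for page in pages])

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())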
As I'm very new to creating scrapers that use multiprocessing, I expected a real-life script in order to understand the logic clearly. The site used in my original script has some bot protection mechanism. However, I've found a very similar webpage to apply multiprocessing to.
import requests
from multiprocessing import Pool
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://srar.com/roster/index.php?agent_search={}"

def get_links(link):
    completelinks = []
    for ilink in [chr(i) for i in range(ord('a'), ord('d') + 1)]:
        res = requests.get(link.format(ilink))
        soup = BeautifulSoup(res.text, 'lxml')
        for items in soup.select("table.border tr"):
            if not items.select("td a[href^='index.php?agent']"):
                continue
            data = [urljoin(link, item.get("href")) for item in items.select("td a[href^='index.php?agent']")]
            completelinks.extend(data)
    return completelinks

def get_info(nlink):
    req = requests.get(nlink)
    sauce = BeautifulSoup(req.text, "lxml")
    for tr in sauce.select("table[style$='1px;'] tr"):
        table = [td.get_text(strip=True) for td in tr.select("td")]
        print(table)

if __name__ == '__main__':
    allurls = get_links(url)
    with Pool(10) as p:  # this is the number responsible for limiting the number of requests
        p.map(get_info, allurls)
    p.join()
Although I'm not sure I implemented the ThreadPool logic described in SocketPlayer's answer correctly in the following script, it seems to be working flawlessly. Feel free to rectify it if I went wrong anywhere.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool as Pool
from threading import Semaphore

MAX_RUN_AT_ONCE = 5
NUMBER_OF_THREADS = 10

sm = Semaphore(MAX_RUN_AT_ONCE)
url = "http://srar.com/roster/index.php?agent_search={}"

def get_links(link):
    with sm:
        completelinks = []
        for ilink in [chr(i) for i in range(ord('a'), ord('d') + 1)]:
            res = requests.get(link.format(ilink))
            soup = BeautifulSoup(res.text, 'lxml')
            for items in soup.select("table.border tr"):
                if not items.select("td a[href^='index.php?agent']"):
                    continue
                data = [urljoin(link, item.get("href")) for item in items.select("td a[href^='index.php?agent']")]
                completelinks.extend(data)
        return completelinks

def get_info(nlink):
    req = requests.get(nlink)
    sauce = BeautifulSoup(req.text, "lxml")
    for tr in sauce.select("table[style$='1px;'] tr")[1:]:
        table = [td.get_text(strip=True) for td in tr.select("td")]
        print(table)

if __name__ == '__main__':
    p = Pool(NUMBER_OF_THREADS)
    p.map(get_info, get_links(url))
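One thing worth flagging, since the poster invited corrections: in the script above the semaphore only guards get_links, which runs once in the main thread, so it never actually limits the requests made by the pool's workers. A minimal adjustment (my suggestion, not part of the original answers, reusing sm, requests and BeautifulSoup from the script above) is to acquire the semaphore inside get_info, the function the threads run:

def get_info(nlink):
    # Acquiring the semaphore here caps how many of the pool's NUMBER_OF_THREADS
    # workers can issue a request at the same time.
    with sm:
        req = requests.get(nlink)
        sauce = BeautifulSoup(req.text, "lxml")
        for tr in sauce.select("table[style$='1px;'] tr")[1:]:
            table = [td.get_text(strip=True) for td in tr.select("td")]
            print(table)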

How to multithread a function in Flask?

I have a function that gets the current price of any currency I choose using Google Finance. I want to multithread it so I can send each request separately.
Here is my code:
import threading
import urllib2
from bs4 import BeautifulSoup

def currency_converter(amount, currency):
    url = 'https://finance.google.com/finance/converter?a={}&from=KGS&to={}&meta=ei%3DmSr0WeHCCYvBsAH8n6OIBA'.format(amount, currency)
    urlHandler = urllib2.urlopen(url)
    html = urlHandler.read()
    bsoup = BeautifulSoup(html, 'lxml')
    num = bsoup.find('span').text.split()[0]
    return float(num)

@main_route.app_template_filter('currency_converter')
def thread_me(amount, currency):
    t = threading.Thread(target=currency_converter, args=[amount, currency])
    t.start()
    t.join()
    return t
Here is how I run the filter inside my template:
{{ product.price|float|currency_converter('RUB') }} руб
Here I am returning the t value, but I want to return the data from the API. How can I do that?
Another problem I forgot to mention: if I open any product page, the page is delayed for about 10 seconds!
You should try multiprocessing instead:
from multiprocessing.pool import ThreadPool

# currency_converter code

@main_route.app_template_filter('currency_converter')
def thread_me(amount, currency):
    pool = ThreadPool(processes=1)
    result = pool.apply_async(currency_converter, (amount, currency))
    return result.get()
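Note that result.get() still blocks until the worker returns, so the template render waits either way; if the roughly 10 second delay comes from converting many products on one page, a simple cache (a sketch reusing the names from the question; _rate_cache is a made-up name for illustration) avoids repeating identical network calls:

from multiprocessing.pool import ThreadPool

_rate_cache = {}   # (amount, currency) -> converted value

@main_route.app_template_filter('currency_converter')
def thread_me(amount, currency):
    key = (amount, currency)
    if key not in _rate_cache:
        # The first call per (amount, currency) pair still blocks on the network;
        # later calls for the same pair return instantly from the cache.
        pool = ThreadPool(processes=1)
        _rate_cache[key] = pool.apply_async(currency_converter, (amount, currency)).get()
    return _rate_cache[key]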
