I have a function that gets the current price of any currency I choose using Google Finance. I want to multithread it so I can send each request separately.
Here is my code:
import threading
import urllib2
from bs4 import BeautifulSoup

def currency_converter(amount, currency):
    # Build the Google Finance converter URL (converting from KGS to the target currency)
    url = 'https://finance.google.com/finance/converter?a={}&from=KGS&to={}&meta=ei%3DmSr0WeHCCYvBsAH8n6OIBA'.format(amount, currency)
    urlHandler = urllib2.urlopen(url)
    html = urlHandler.read()
    bsoup = BeautifulSoup(html, 'lxml')
    # The converted amount is the first token inside the result <span>
    num = bsoup.find('span').text.split()[0]
    return float(num)
@main_route.app_template_filter('currency_converter')
def thread_me(amount, currency):
    t = threading.Thread(target=currency_converter, args=[amount, currency])
    t.start()
    t.join()
    return t
Here is how I use the filter inside my template:
{{ product.price|float|currency_converter('RUB') }} руб
Here I am returning the t value, but I want to return the data from the API. How can I do that?
Another problem I forgot to mention: whenever I open a product page, the page is delayed by about 10 seconds.
You should try multiprocessing's ThreadPool instead:
from multiprocessing.pool import ThreadPool

# currency_converter code as above

@main_route.app_template_filter('currency_converter')
def thread_me(amount, currency):
    pool = ThreadPool(processes=1)
    result = pool.apply_async(currency_converter, (amount, currency))
    return result.get()
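Note that result.get() still blocks until the request finishes, so the template render waits on Google Finance either way. A minimal sketch of one way to hide that latency, assuming you are free to cache rates per currency for a short time and that the conversion is linear in the amount (the cache names and the 5-minute timeout below are made up for illustration):

import time

_rate_cache = {}   # hypothetical cache: currency -> (timestamp, rate for converting 1 unit)
_CACHE_TTL = 300   # assumed freshness window in seconds

def cached_currency_converter(amount, currency):
    now = time.time()
    hit = _rate_cache.get(currency)
    if hit is None or now - hit[0] > _CACHE_TTL:
        # One real request per currency per TTL; reuses currency_converter from above
        _rate_cache[currency] = (now, currency_converter(1, currency))
    return amount * _rate_cache[currency][1]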
I want to create a function that returns a variable I can write to a CSV.
If I write:
from makesoup import make_soup

def get_links(soupbowl):
    linkname = ""
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    linkname = linkname + "\n" + (link.attrs['href'])[1:]
                    print(linkname)

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
print(pyt)
It prints what I want (all the links on the page) inside the function, and None from print(pyt).
Instead of print(linkname) in the function, I want to return linkname.
But when I do, I only get the first link on the page. Is there a way to pass all the links to the variable pyt, which is outside the function?
Thank you in advance.
Try the following, to get all the links in one go:
from makesoup import make_soup

def get_links(soupbowl):
    links_found = []
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    # Collect each link instead of printing it
                    links_found.append(link.attrs['href'][1:])
    return links_found

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
print(pyt)
Or use yield, to return them one by one - while you process the output for something else:
from makesoup import make_soup

def get_links(soupbowl):
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    # Hand each link back as soon as it is found
                    yield link.attrs['href'][1:]

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
for link in pyt:
    do_something(link)  # placeholder for whatever processing you need
from makesoup import make_soup

def get_links(soupbowl):
    links = []
    for boot in soupbowl.findAll('tbody'):
        for record in boot.findAll('tr', {"row0", "row1"}):
            for link in record.find_all('a'):
                if link.has_attr('href'):
                    # Append each href to the list rather than printing it
                    links.append(link.attrs['href'][1:])
    return links

soup = make_soup("https://www.footballdb.com/teams/index.html")
pyt = get_links(soup)
print(pyt)
I want to pass a URL into the task function, in order to fetch the content of several sites at the same time.
But it doesn't run to completion; it stops unexpectedly.
from scrapy.selector import Selector
from twisted.internet import defer, reactor
from twisted.web.client import getPage

def get_response_callback(content):
    txt = str(content, encoding='utf-8')
    resp = Selector(text=txt)
    title = resp.xpath('//title/text()').extract_first()
    print(title)

@defer.inlineCallbacks
def task(*args, **kwargs):
    # url = 'http://www.baidu.com'
    url = kwargs.get('url')
    print(url)
    d = getPage(url.encode('utf-8'))
    d.addCallback(get_response_callback)
    yield d

def done(*args, **kwargs):
    reactor.stop()

task_list = []
url_list = ['http://www.baidu.com', 'http://www.bing.com', 'http://www.qq.com']
for i in url_list:
    d = task(url=i)
    task_list.append(d)

dd = defer.DeferredList(task_list)
dd.addBoth(done)  # stop the reactor once every task has finished
reactor.run()
How can I pass an argument to the task function?
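A minimal sketch, assuming Twisted's getPage (which is deprecated in newer releases): once the @defer.inlineCallbacks decorator is actually applied, keyword arguments pass through like any other function call, and each call returns a Deferred that DeferredList can collect:

from twisted.internet import defer, reactor
from twisted.web.client import getPage

@defer.inlineCallbacks
def task(url):
    # getPage returns a Deferred; yielding it suspends the generator
    # until the page body is available.
    body = yield getPage(url.encode('utf-8'))
    print(url, len(body))

def done(results):
    reactor.stop()

urls = ['http://www.baidu.com', 'http://www.bing.com', 'http://www.qq.com']
deferreds = [task(url=u) for u in urls]       # keyword argument passed per call
defer.DeferredList(deferreds).addBoth(done)   # fires when all tasks finish
reactor.run()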
I've written a script in Python using threading to handle multiple requests at the same time and make the scraping process faster. The script is doing its job accordingly.
In short, what the scraper does: it collects all the links from each listing page that lead to the individual detail pages (where the information is stored) and scrapes the happy hours and featured specials from there. The scraper keeps going until all 29 pages are crawled.
As there may be numerous links to work with, I would like to limit the number of concurrent requests. However, as I don't have much experience with this, I can't find an ideal way to modify my existing script for that purpose.
Any help will be vastly appreciated.
This is my attempt so far:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading

url = "https://www.totalhappyhour.com/washington-dc-happy-hour/?page={}"

def get_info(link):
    # Walk the 29 listing pages and spawn one thread per detail page
    for mlink in [link.format(page) for page in range(1, 30)]:
        response = requests.get(mlink)
        soup = BeautifulSoup(response.text, "lxml")
        itemlinks = [urljoin(link, container.select_one("h2.name a").get("href")) for container in soup.select(".profile")]
        threads = []
        for ilink in itemlinks:
            thread = threading.Thread(target=fetch_info, args=(ilink,))
            thread.start()
            threads += [thread]
        for thread in threads:
            thread.join()

def fetch_info(nlink):
    response = requests.get(nlink)
    soup = BeautifulSoup(response.text, "lxml")
    for container in soup.select(".specials"):
        try:
            hours = container.select_one("h3").text
        except Exception:
            hours = ""
        try:
            fspecial = ' '.join([item.text for item in container.select(".special")])
        except Exception:
            fspecial = ""
        print(f'{hours}---{fspecial}')

if __name__ == '__main__':
    get_info(url)
You should look at asyncio; it's simple and can help you do things faster!
Also, multiprocessing.Pool can simplify your code (in case you don't want to use asyncio).
multiprocessing.pool also has a ThreadPool equivalent if you prefer to use threads.
As for the request limit, I recommend using threading.Semaphore (or the matching semaphore type if you switch away from threading).
threading approach:
from multiprocessing.pool import ThreadPool as Pool
from threading import Semaphore
from time import sleep

MAX_RUN_AT_ONCE = 5
NUMBER_OF_THREADS = 10

sm = Semaphore(MAX_RUN_AT_ONCE)

def do_task(number):
    with sm:
        print(f"run with {number}")
        sleep(3)
        return number * 2

def main():
    p = Pool(NUMBER_OF_THREADS)
    results = p.map(do_task, range(10))
    print(results)

if __name__ == '__main__':
    main()
multiprocessing approach:
from multiprocessing import Pool
from multiprocessing import Semaphore
from time import sleep

MAX_RUN_AT_ONCE = 5
NUMBER_OF_PROCESS = 10

semaphore = None

def initializer(sm):
    """init the semaphore for the child process"""
    global semaphore
    semaphore = sm

def do_task(number):
    with semaphore:
        print(f"run with {number}\n")
        sleep(3)
        return number * 2

def main():
    sm = Semaphore(MAX_RUN_AT_ONCE)
    p = Pool(NUMBER_OF_PROCESS, initializer=initializer,
             initargs=[sm])
    results = p.map(do_task, range(10))
    print(results)

if __name__ == '__main__':
    main()
asyncio approach:
import asyncio

MAX_RUN_AT_ONCE = 5
sm = asyncio.Semaphore(MAX_RUN_AT_ONCE)

async def do_task(number):
    async with sm:
        print(f"run with {number}\n")
        await asyncio.sleep(3)
        return number * 2

async def main():
    coros = [do_task(number) for number in range(10)]
    finished, _ = await asyncio.wait(coros)
    print([fut.result() for fut in finished])

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
For making HTTP requests with asyncio you should use aiohttp. You could also run requests via loop.run_in_executor, but at that point you might as well not use asyncio at all, since all your code would still essentially be blocking requests calls.
output:
run with 0
run with 1
run with 2
run with 3
run with 4
(here there is a pause due to the semaphore and sleep)
run with 5
run with 6
run with 7
run with 8
run with 9
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
You can also check out concurrent.futures.ThreadPoolExecutor.
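For completeness, a minimal sketch of the same request-limiting idea with ThreadPoolExecutor, where max_workers itself caps how many requests run at once (the URLs and the fetch helper below are placeholders, not taken from the original script):

from concurrent.futures import ThreadPoolExecutor
import requests

MAX_RUN_AT_ONCE = 5  # at most this many requests in flight

def fetch(url):
    # Placeholder worker: download one page and return its size
    response = requests.get(url)
    return url, len(response.text)

def main():
    urls = ["https://example.com/page/{}".format(i) for i in range(10)]  # hypothetical URLs
    # max_workers acts as the concurrency limit, so no explicit semaphore is needed
    with ThreadPoolExecutor(max_workers=MAX_RUN_AT_ONCE) as executor:
        for url, size in executor.map(fetch, urls):
            print(url, size)

if __name__ == '__main__':
    main()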
As I'm very new to writing scrapers with multiprocessing, I wanted a real-life script in order to understand the logic clearly. The site used within the script has some bot-protection mechanism. However, I've found a very similar webpage to apply multiprocessing to.
import requests
from multiprocessing import Pool
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://srar.com/roster/index.php?agent_search={}"

def get_links(link):
    completelinks = []
    # Search pages 'a' through 'd' and collect every agent profile link
    for ilink in [chr(i) for i in range(ord('a'), ord('d') + 1)]:
        res = requests.get(link.format(ilink))
        soup = BeautifulSoup(res.text, 'lxml')
        for items in soup.select("table.border tr"):
            if not items.select("td a[href^='index.php?agent']"):
                continue
            data = [urljoin(link, item.get("href")) for item in items.select("td a[href^='index.php?agent']")]
            completelinks.extend(data)
    return completelinks

def get_info(nlink):
    req = requests.get(nlink)
    sauce = BeautifulSoup(req.text, "lxml")
    for tr in sauce.select("table[style$='1px;'] tr"):
        table = [td.get_text(strip=True) for td in tr.select("td")]
        print(table)

if __name__ == '__main__':
    allurls = get_links(url)
    with Pool(10) as p:  # this is the number responsible for limiting the number of requests
        p.map(get_info, allurls)  # map blocks until every worker has finished
Although I'm not sure I've implemented the ThreadPool logic described in SocketPlayer's answer correctly in the following script, it seems to be working flawlessly. Feel free to correct me if I went wrong anywhere.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool as Pool
from threading import Semaphore

MAX_RUN_AT_ONCE = 5
NUMBER_OF_THREADS = 10

sm = Semaphore(MAX_RUN_AT_ONCE)

url = "http://srar.com/roster/index.php?agent_search={}"

def get_links(link):
    with sm:
        completelinks = []
        for ilink in [chr(i) for i in range(ord('a'), ord('d') + 1)]:
            res = requests.get(link.format(ilink))
            soup = BeautifulSoup(res.text, 'lxml')
            for items in soup.select("table.border tr"):
                if not items.select("td a[href^='index.php?agent']"):
                    continue
                data = [urljoin(link, item.get("href")) for item in items.select("td a[href^='index.php?agent']")]
                completelinks.extend(data)
        return completelinks

def get_info(nlink):
    req = requests.get(nlink)
    sauce = BeautifulSoup(req.text, "lxml")
    for tr in sauce.select("table[style$='1px;'] tr")[1:]:
        table = [td.get_text(strip=True) for td in tr.select("td")]
        print(table)

if __name__ == '__main__':
    p = Pool(NUMBER_OF_THREADS)
    p.map(get_info, get_links(url))
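One thing worth noting, offered only as a sketch: placed inside get_links, the semaphore wraps a single call and never throttles the detail-page requests; to limit those, it would have to be acquired inside get_info, mirroring the do_task pattern from the answer above (reusing sm, requests and BeautifulSoup from the script):

def get_info(nlink):
    # Acquire the semaphore per request so at most MAX_RUN_AT_ONCE
    # detail pages are fetched at the same time.
    with sm:
        req = requests.get(nlink)
        sauce = BeautifulSoup(req.text, "lxml")
        for tr in sauce.select("table[style$='1px;'] tr")[1:]:
            table = [td.get_text(strip=True) for td in tr.select("td")]
            print(table)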
I have made a script which constructs a checkout URL for Shopify websites. This is done by appending each unique product 'variant' ID to the checkout URL and then opening that URL in a web browser. To find the variant ID, I need to parse the website's sitemap, which I am currently doing in a separate thread for each product; however, with each thread added, the time it takes increases by quite a lot (nearly one second).
Why is this the case? Shouldn't it take around the same time, since each thread does basically the same thing?
For reference, one thread takes around 2.0s, two threads 2.8s and three threads around 3.8s.
Here is my code:
import time
import requests
from bs4 import BeautifulSoup
import webbrowser
import threading

sitemap2 = 'https://deadstock.ca/sitemap_products_1.xml'
atc_url = 'https://deadstock.ca/cart/'
# CHANGE SITEMAP TO THE CORRECT ONE (THE SITE YOU ARE SCRAPING)

variant_list = []

def add_to_cart(keywords, size):
    init = time.time()

    # Initialize session
    product_url = ''
    parse_session = requests.Session()
    response = parse_session.get(sitemap2)
    soup = BeautifulSoup(response.content, 'lxml')
    variant_id = 0

    # Find item
    for urls in soup.find_all('url'):
        for images in urls.find_all('image:image'):
            if all(i in images.find('image:title').text.lower() for i in keywords):
                now = time.time()
                product_name = images.find('image:title').text
                print('FOUND: ' + product_name + ' - ' + str(format(now - init, '.3g')) + 's')
                product_url = urls.find("loc").text

    if product_url != '':
        response1 = parse_session.get(product_url + ".xml")
        soup = BeautifulSoup(response1.content, 'lxml')
        for variants in soup.find_all('variant'):
            if size in variants.find('title').text.lower():
                variant_id = variants.find('id', type='integer').text
                atc_link = str(variant_id) + ':1'
                print(atc_link)
                variant_list.append(atc_link)

    try:
        print("PARSED PRODUCT: " + product_name)
    except UnboundLocalError:
        print("Retrying")
        add_to_cart(keywords, size)

def open_checkout():
    url = 'https://deadstock.ca/cart/'
    for var in variant_list:
        url = url + var + ','
    webbrowser.open_new_tab(url)

# When initializing a new thread, only change the keywords in the args, and make sure you start and join the thread.
# Change sitemap in scraper.py to your website's sitemap
# If the script finds multiple items, the first item will be opened so please try to be very specific yet accurate.

def main():
    print("Starting Script")
    init = time.time()
    try:
        t1 = threading.Thread(target=add_to_cart, args=(['alltimers', 'relations', 't-shirt', 'white'], 's',))
        t2 = threading.Thread(target=add_to_cart, args=(['alltimers', 'relations', 'maroon'], 's',))
        t3 = threading.Thread(target=add_to_cart, args=(['brain', 'dead', 'melter'], 's',))

        t1.start()
        t2.start()
        t3.start()

        t1.join()
        t2.join()
        t3.join()

        print(variant_list)
        open_checkout()
    except:
        print("Product not found / not yet live. Retrying..")
        main()

    print("Time taken: " + str(time.time() - init))

if __name__ == '__main__':
    main()
Question: ... one thread takes around 2.0s, two threads 2.8s and three threads around 3.8s
Regarding your example code, you are timing all of the threads together, not each one on its own.
As @asettouf pointed out, there is overhead, meaning you have to pay for it.
But I assume that doing these 3 tasks threaded is still faster than doing them one after the other.
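Part of the extra time likely also comes from the fact that, in the script above, every add_to_cart call downloads and parses the same sitemap, and BeautifulSoup parsing is CPU-bound work that Python threads serialize under the GIL. A minimal sketch of one way to avoid that, fetching and parsing the sitemap once and sharing the parsed soup between the lookups (the helper name find_variant is made up for illustration):

import requests
from bs4 import BeautifulSoup

sitemap2 = 'https://deadstock.ca/sitemap_products_1.xml'

def load_sitemap():
    # One download and one parse, shared by every product lookup
    response = requests.get(sitemap2)
    return BeautifulSoup(response.content, 'lxml')

def find_variant(soup, keywords):
    # Hypothetical helper: return the product URL whose title matches all keywords
    for urls in soup.find_all('url'):
        for images in urls.find_all('image:image'):
            if all(k in images.find('image:title').text.lower() for k in keywords):
                return urls.find("loc").text
    return None

soup = load_sitemap()
for keywords in (['alltimers', 'relations', 't-shirt', 'white'],
                 ['alltimers', 'relations', 'maroon'],
                 ['brain', 'dead', 'melter']):
    print(keywords, '->', find_variant(soup, keywords))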
I have built a very simple web crawler to crawl ~100 small JSON files at the URL below. The issue is that the crawler takes more than an hour to complete. I find that hard to understand given how small the JSON files are. Am I doing something fundamentally wrong here?
import json
import requests
from lxml import html

def get_senate_vote(vote):
    URL = 'https://www.govtrack.us/data/congress/113/votes/2013/s%d/data.json' % vote
    response = requests.get(URL)
    json_data = json.loads(response.text)
    return json_data

def get_all_votes():
    all_senate_votes = []
    URL = "http://www.govtrack.us/data/congress/113/votes/2013"
    response = requests.get(URL)
    root = html.fromstring(response.content)
    # Each senate vote appears as a link starting with 's' in the directory listing
    for a in root.xpath('/html/body/pre/a'):
        link = a.xpath('text()')[0].strip()
        if link[0] == 's':
            vote = int(link[1:-1])
            try:
                vote_json = get_senate_vote(vote)
            except:
                return all_senate_votes
            all_senate_votes.append(vote_json)
    return all_senate_votes

vote_data = get_all_votes()
Here is a rather simple code sample in which I've measured the time taken for each call. On my system it takes about 2 seconds per request on average, and there are 582 pages to visit, so that is around 19 minutes without printing the JSON to the console. In your case network time plus print time may increase it.
#!/usr/bin/python
import requests
import re
import time

def find_votes():
    r = requests.get("https://www.govtrack.us/data/congress/113/votes/2013/")
    data = r.text
    votes = re.findall(r's\d+', data)
    return votes

def crawl_data(votes):
    print("Total pages: " + str(len(votes)))
    for x in votes:
        url = 'https://www.govtrack.us/data/congress/113/votes/2013/' + x + '/data.json'
        t1 = time.time()
        r = requests.get(url)
        json = r.json()
        print(time.time() - t1)

crawl_data(find_votes())
If you are using Python 3.x and crawling multiple sites, then for even better performance I warmly suggest the aiohttp module, which implements asynchronous principles.
For example:
import aiohttp
import asyncio

sites = ['url_1', 'url_2']
results = []

def save_response(result):
    # Done-callback: stash each site's body in the shared results list
    site_content = result.result()
    results.append(site_content)

async def crawl_site(site):
    async with aiohttp.ClientSession() as session:
        async with session.get(site) as resp:
            resp = await resp.text()
            return resp

tasks = []
for site in sites:
    task = asyncio.ensure_future(crawl_site(site))
    task.add_done_callback(save_response)
    tasks.append(task)

all_tasks = asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(all_tasks)
loop.close()

print(results)
For more background, see the aiohttp documentation.
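Applied to the govtrack crawl above, a sketch might look like the following (assuming the vote identifiers gathered by find_votes, and decoding each response with resp.json(); content_type=None skips aiohttp's content-type check in case the server labels the files differently):

import aiohttp
import asyncio

BASE = 'https://www.govtrack.us/data/congress/113/votes/2013/{}/data.json'

async def fetch_vote(session, vote):
    # Download and decode one vote's JSON document
    async with session.get(BASE.format(vote)) as resp:
        return await resp.json(content_type=None)

async def crawl_votes(votes):
    # One shared session, all requests in flight concurrently
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_vote(session, v) for v in votes))

votes = ['s1', 's2', 's3']  # in practice, the list produced by find_votes()
loop = asyncio.get_event_loop()
vote_data = loop.run_until_complete(crawl_votes(votes))
print(len(vote_data), 'votes downloaded')
loop.close()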