I've been struggling for some time now to find a way to make this faster.
The code:
import time

import requests
from bs4 import BeautifulSoup

def get_resellers(sellerid, price, userassetid, id):
    data = {"expectedCurrency": 1, "expectedPrice": price, "expectedSellerId": sellerid, "userAssetId": userassetid}
    headers = {"X-CSRF-TOKEN": csrftoken}
    k = requests.post(f"https://economy.roblox.com/v1/purchases/products/{id}", data=data, headers=headers, cookies=cookies)

def check_price(id):
    while True:
        try:
            t0 = time.time()
            soup = BeautifulSoup(requests.get(f"https://www.roblox.com/catalog/{id}").content, 'html.parser')
            data_expected_price = soup.select_one('[data-expected-price]')['data-expected-price']
            data_expected_seller_id = soup.select_one('[data-expected-seller-id]')['data-expected-seller-id']
            data_userasset_id = soup.select_one('[data-lowest-private-sale-userasset-id]')['data-lowest-private-sale-userasset-id']
            t1 = time.time()
            total = t1 - t0
            print(total)
            if int(data_expected_price) < 0.7 * int(data_expected_price):
                get_resellers(data_expected_seller_id, data_expected_price, data_userasset_id, id)
        except:
            pass
Is there any faster way to do this, to extract the data, or to make the HTTP request? Anything would help!
Also: it takes around 0.7 seconds to price-check and buy, since it needs to load the site every time. Is there any way to do that faster?
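One thing that usually helps (a hedged suggestion, not something from the original thread): reuse a single requests.Session so each poll does not pay for a fresh TCP/TLS handshake, and let BeautifulSoup use the lxml parser, which is generally faster than html.parser. A minimal sketch, reusing the catalog URL and data attributes from the code above and assuming lxml is installed:

import requests
from bs4 import BeautifulSoup

session = requests.Session()  # keeps the connection alive between polls

def fetch_price_data(item_id):
    # Same catalog page and attributes as in the question; returns None if they are missing
    html = session.get(f"https://www.roblox.com/catalog/{item_id}").content
    soup = BeautifulSoup(html, "lxml")  # lxml parses noticeably faster than html.parser
    price_tag = soup.select_one("[data-expected-price]")
    if price_tag is None:
        return None
    return (
        price_tag["data-expected-price"],
        soup.select_one("[data-expected-seller-id]")["data-expected-seller-id"],
        soup.select_one("[data-lowest-private-sale-userasset-id]")["data-lowest-private-sale-userasset-id"],
    )

Passing the headers and cookies on the same session object would also let the purchase request in get_resellers reuse that connection.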
Related
I have a list of author names (more than 1k) and my Google API quota limit is 20k. I want to pass each author name to the API to get book information. When I tested my code I got a "429 Client Error: Too Many Requests for url..." error. How can I slow down my code without stopping the application? (I'm using Python in Google Colab.)
author_List = ["J. K. Rowling", "mark twain","Emily Dickinson"]
connGoogleAPI(author_List)
def connGoogleAPI(booksData):
    key = "**************************"
    books_list = []
    col = ['Title', 'Authors', 'published Date', 'Description', 'ISBN']
    books_list.append(col)
    res = ""
    err = None
    with requests.Session() as session:
        #err= ""
        for Authors in booksData:
            params = {"q": Authors, "key": key, "maxResults": 1}
            delays = 65  # approximately 1 minute total delay time for any given author
            while True:
                try:
                    #do something
                except Exception as e:
                    if err.status_code == 429:
                        #print("******")
                        if delays <= 0:
                            raise(e)  # we've spent too long delaying
                        time.sleep(1)
                        delays -= 1
                    else:
                        print("-----=")
                        raise(e)  # some other status code
            books_list.append(lookup(res, Authors))
    return books_list
You can import time and then add:
time.sleep(1)
at the end of your for loop, to pause for a second between each iteration.
You could slow down your for loop like this:
First, you need to import time:
import time

delay = 2
for author in author_List:
    newList.append(searchData(author))
    time.sleep(delay)
You can set delay to the number of seconds each iteration should pause (here it would be 2 seconds).
You probably don't want unconditional delays which will slow down your processing unnecessarily. On the other hand, if you start getting HTTP 429 you can't be sure when or even if the server is going to allow you to continue. So you need a strategy that only introduces delays when/if required but also doesn't get into an infinite loop. Consider this:
import requests
import time

listofauthors = ['Mark Twain', 'Dan Brown', 'William Shakespeare']

with requests.Session() as session:
    for author in listofauthors:
        params = {'q': author}
        delays = 60  # approximately 1 minute total delay time for any given author
        while True:
            try:
                r = session.get('https://www.googleapis.com/books/v1/volumes', params=params)
                r.raise_for_status()
                print(r.json())
                break  # all good :-)
            except Exception as e:
                if r.status_code == 429:
                    if delays <= 0:
                        raise(e)  # we've spent too long delaying
                    time.sleep(1)
                    delays -= 1
                else:
                    raise(e)  # some other status code
I'm trying to get the time it takes a page to fully load (like the "Finish" time in Chrome's DevTools). I wrote something in Python, but I get really low time results, way less than a second, which is not realistic. This is what I have:
from urllib.request import urlopen
from time import time

class Webpage:
    def __init__(self, pageName, pageUrl):
        self.pageName = pageName
        self.pageUrl = pageUrl

class LoadingDetail:
    def __init__(self, webPage, loadingTimes):  # Getting the webpage object and its loading times
        self.webPage = webPage
        self.loadingTimes = loadingTimes

pages = [
    Webpage("test", "URL"),
    Webpage("test2", "URL"),
    Webpage("test3", "URL"),
    Webpage("test4", "URL"),
]

loadingDetails = []
for page in pages:  # Going through each page in the array.
    pageLoadTimes = []  # Storing the time it took the page to load.
    for x in range(0, 3):  # Number of times we request each page.
        stream = urlopen(page.pageUrl)
        startTime = time()
        streamRead = stream.read()
        endTime = time()
        stream.close()
        timeToLoad = endTime - startTime  # The time it took to read the whole page.
        pageLoadTimes.append(timeToLoad)
    loadDetails = LoadingDetail(page, pageLoadTimes)
    loadingDetails.append(loadDetails)
I get results like 0.00011. I searched but only found Selenium-based answers, which I can't use.
Is there a way to do it with Python only? Is Python the right tool for this? I saw answers using JS that seem to be exactly what I was looking for.
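One detail worth noting (an observation, not part of the original question or answers): the timer above starts only after urlopen() has already connected and received the response headers, so it measures little more than the final read(). A small sketch that times the connection and the full body read together, using only the standard library:

from time import time
from urllib.request import urlopen

def timed_fetch(url):
    startTime = time()       # start before the connection is opened
    stream = urlopen(url)    # DNS + TCP + request + response headers
    body = stream.read()     # full response body
    stream.close()
    return time() - startTime, len(body)

elapsed, size = timed_fetch("https://www.python.org")
print(f"{size} bytes in {elapsed:.3f} s")

Even then, this only covers the HTML document itself; a browser's "Finish" time also includes images, scripts, and stylesheets, so the numbers will not match Chrome exactly.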
I tried it and was able to measure the time with the code below; this might help.
reference: geeksforgeeks.org/timeit-python-examples/
import timeit
mysetup = "from urllib.request import urlopen"
mycode = '''
urlopen('http://www.python.org')
'''
print(timeit.timeit(setup = mysetup, stmt = mycode, number = 1))
I encounter an "index out of range" error when I try to get the number of contributors of a GitHub project in a loop. After some iterations (which work perfectly) it just throws that exception. I have no clue why ...
import requests
from lxml import html

for x in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)  # prints the correct number until the exception
Here's the exception.
----> 4 contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
IndexError: list index out of range
It seems likely that you're getting a 429 - Too Many Requests, since you're firing requests one after the other.
You might want to modify your code as such:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
    contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
    print(contributors_number)
    time.sleep(3)  # Wait a bit before firing off another request
Better yet would be:
import time

for index in range(100):
    r = requests.get('https://github.com/tipsy/profile-summary-for-github')
    if r.status_code in [200]:  # Check if the request was successful
        xpath = '//span[contains(@class, "num") and following-sibling::text()[normalize-space()="contributors"]]/text()'
        contributors_number = int(html.fromstring(r.text).xpath(xpath)[0].strip().replace(',', ''))
        print(contributors_number)
    else:
        print("Failed fetching page, status code: " + str(r.status_code))
    time.sleep(3)  # Wait a bit before firing off another request
Now this works perfectly for me while using the API. Probably the cleanest way of doing it.
import requests
import json

url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100'
response = requests.get(url)
commits = json.loads(response.text)
commits_total = len(commits)
page_number = 1
while len(commits) == 100:
    page_number += 1
    url = 'https://api.github.com/repos/valentinxxx/nginxconfig.io/commits?&per_page=100' + '&page=' + str(page_number)
    response = requests.get(url)
    commits = json.loads(response.text)
    commits_total += len(commits)
GitHub is blocking your repeated requests. Do not scrape sites in quick succession; many website operators actively block clients that make too many requests, and as a result the content that is returned no longer matches your XPath query.
You should be using the REST API that GitHub provides to retrieve project stats such as the number of contributors, and you should implement some kind of rate limiting. There is no need to retrieve the same number 100 times; contributor counts do not change that rapidly.
API responses include information on how many requests you can make in a time window, and you can use conditional requests to only incur rate limit costs when the data actually has changed:
import requests
import time
from urllib.parse import parse_qsl, urlparse

owner, repo = 'tipsy', 'profile-summary-for-github'
github_username = '....'
# token = '....'  # optional Github basic auth token

stats = 'https://api.github.com/repos/{}/{}/contributors'

with requests.session() as sess:
    # GitHub requests you use your username or appname in the header
    sess.headers['User-Agent'] += ' - {}'.format(github_username)

    # Consider logging in! You'll get more quota
    # sess.auth = (github_username, token)

    # start with the first, move to the last when available, include anonymous
    last_page = stats.format(owner, repo) + '?per_page=100&page=1&anon=true'
    while True:
        r = sess.get(last_page)
        if r.status_code == requests.codes.not_found:
            print("No such repo")
            break
        if r.status_code == requests.codes.no_content:
            print("No contributors, repository is empty")
            break

        if r.status_code == requests.codes.accepted:
            print("Stats not yet ready, retrying")
        elif r.status_code == requests.codes.not_modified:
            print("Stats not changed")
        elif r.ok:
            # success! Check for a last page, get that instead of current
            # to get accurate count
            link_last = r.links.get('last', {}).get('url')
            if link_last and r.url != link_last:
                last_page = link_last
            else:
                # this is the last page, report on count
                params = dict(parse_qsl(urlparse(r.url).query))
                page_num = int(params.get('page', '1'))
                per_page = int(params.get('per_page', '100'))
                contributor_count = len(r.json()) + (per_page * (page_num - 1))
                print("Contributor count:", contributor_count)

        # only get us a fresh response next time
        sess.headers['If-None-Match'] = r.headers['ETag']

        # pace ourselves following the rate limit
        window_remaining = int(r.headers['X-RateLimit-Reset']) - time.time()
        rate_remaining = int(r.headers['X-RateLimit-Remaining'])
        # sleep long enough to honour the rate limit or at least 100 milliseconds
        time.sleep(max(window_remaining / rate_remaining, 0.1))
The above uses a requests session object to handle repeated headers and ensure that you get to reuse connections where possible.
A good library such as github3.py (incidentally written by a requests core contributor) will take care of most of those details for you.
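As a rough illustration of that point (a sketch based on my reading of github3.py's documented API, so double-check the call names before relying on it), the contributor count boils down to a couple of lines:

import github3

# Anonymous access works but has a low rate limit; logging in with a token
# via github3.login() raises the quota considerably.
gh = github3.GitHub()
repo = gh.repository('tipsy', 'profile-summary-for-github')
# contributors() yields contributor objects and follows pagination for you
contributor_count = sum(1 for _ in repo.contributors())
print("Contributor count:", contributor_count)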
If you do want to persist in scraping the site directly, you take the risk that the site operators block you altogether. Try to take some responsibility by not hammering the site continually.
That means that at the very least, you should honour the Retry-After header that GitHub gives you on 429:
if not r.ok:
    print("Received a response other than 200 OK:", r.status_code, r.reason)
    retry_after = r.headers.get('Retry-After')
    if retry_after is not None:
        print("Response included a Retry-After:", retry_after)
        time.sleep(int(retry_after))
else:
    # parse OK response
I have made a script which constructs a checkout URL for Shopify websites. This is done by appending each unique product 'variant' ID to the checkout URL and then opening that URL in a web browser. To find the variant ID, I need to parse the website's sitemap, which I am currently doing in a separate thread for each product. However, with each thread added, the time it takes increases by quite a lot (nearly one second).
Why is this the case? Shouldn't it take around the same time, since each thread does basically the same thing?
For reference, one thread takes around 2.0s, two threads 2.8s, and three threads around 3.8s.
Here is my code:
import time
import requests
from bs4 import BeautifulSoup
import webbrowser
import threading

sitemap2 = 'https://deadstock.ca/sitemap_products_1.xml'
atc_url = 'https://deadstock.ca/cart/'
# CHANGE SITEMAP TO THE CORRECT ONE (THE SITE YOU ARE SCRAPING)

variant_list = []

def add_to_cart(keywords, size):
    init = time.time()
    # Initialize session
    product_url = ''
    parse_session = requests.Session()
    response = parse_session.get(sitemap2)
    soup = BeautifulSoup(response.content, 'lxml')
    variant_id = 0

    # Find Item
    for urls in soup.find_all('url'):
        for images in urls.find_all('image:image'):
            if all(i in images.find('image:title').text.lower() for i in keywords):
                now = time.time()
                product_name = images.find('image:title').text
                print('FOUND: ' + product_name + ' - ' + str(format(now-init, '.3g')) + 's')
                product_url = urls.find("loc").text

    if product_url != '':
        response1 = parse_session.get(product_url + ".xml")
        soup = BeautifulSoup(response1.content, 'lxml')
        for variants in soup.find_all('variant'):
            if size in variants.find('title').text.lower():
                variant_id = variants.find('id', type='integer').text
                atc_link = str(variant_id) + ':1'
                print(atc_link)
                variant_list.append(atc_link)

    try:
        print("PARSED PRODUCT: " + product_name)
    except UnboundLocalError:
        print("Retrying")
        add_to_cart(keywords, size)

def open_checkout():
    url = 'https://deadstock.ca/cart/'
    for var in variant_list:
        url = url + var + ','
    webbrowser.open_new_tab(url)

# When initializing a new thread, only change the keywords in the args, and make sure you start and join the thread.
# Change sitemap in scraper.py to your websites' sitemap
# If the script finds multiple items, the first item will be opened so please try to be very specific yet accurate.

def main():
    print("Starting Script")
    init = time.time()
    try:
        t1 = threading.Thread(target=add_to_cart, args=(['alltimers', 'relations', 't-shirt', 'white'], 's',))
        t2 = threading.Thread(target=add_to_cart, args=(['alltimers', 'relations', 'maroon'], 's',))
        t3 = threading.Thread(target=add_to_cart, args=(['brain', 'dead', 'melter'], 's',))

        t1.start()
        t2.start()
        t3.start()

        t1.join()
        t2.join()
        t3.join()

        print(variant_list)
        open_checkout()
    except:
        print("Product not found / not yet live. Retrying..")
        main()

    print("Time taken: " + str(time.time() - init))

if __name__ == '__main__':
    main()
Question: ... one thread takes around 2.0s, two threads 2.8s and three threads around 3.8s
Regarding your example code: you are measuring the combined wall-clock time of all the threads together.
As @asettouf pointed out, there is an overhead, meaning you have to pay for it.
But I would assume that doing these 3 tasks threaded is still faster than doing them one after the other.
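One way to check that claim (an illustrative sketch, not code from the thread; fetch_sitemap is a hypothetical stand-in for add_to_cart that only does the network part) is to time the whole run once sequentially and once threaded, instead of timing inside each worker:

import time
from concurrent.futures import ThreadPoolExecutor

import requests

SITEMAP = 'https://deadstock.ca/sitemap_products_1.xml'  # taken from the question

def fetch_sitemap(_):
    # Hypothetical stand-in for add_to_cart: download only, no parsing
    return len(requests.get(SITEMAP).content)

t0 = time.time()
sequential = [fetch_sitemap(i) for i in range(3)]
print("sequential:", round(time.time() - t0, 2), "s")

t0 = time.time()
with ThreadPoolExecutor(max_workers=3) as pool:
    threaded = list(pool.map(fetch_sitemap, range(3)))
print("threaded:  ", round(time.time() - t0, 2), "s")

The downloads should overlap nicely; the BeautifulSoup parsing in the real add_to_cart is CPU-bound and serialised by the GIL, which is part of why the full workers gain less from threads than a pure-download test.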
I have built a very simple web crawler to crawl ~100 small JSON files from the URL below. The issue is that the crawler takes more than an hour to complete. I find that hard to understand, given how small the JSON files are. Am I doing something fundamentally wrong here?
import json
import requests
from lxml import html

def get_senate_vote(vote):
    URL = 'https://www.govtrack.us/data/congress/113/votes/2013/s%d/data.json' % vote
    response = requests.get(URL)
    json_data = json.loads(response.text)
    return json_data

def get_all_votes():
    all_senate_votes = []
    URL = "http://www.govtrack.us/data/congress/113/votes/2013"
    response = requests.get(URL)
    root = html.fromstring(response.content)
    for a in root.xpath('/html/body/pre/a'):
        link = a.xpath('text()')[0].strip()
        if link[0] == 's':
            vote = int(link[1:-1])
            try:
                vote_json = get_senate_vote(vote)
            except:
                return all_senate_votes
            all_senate_votes.append(vote_json)
    return all_senate_votes

vote_data = get_all_votes()
Here is a rather simple code sample; I've calculated the time taken for each call. On my system it takes about 2 seconds per request on average, and there are 582 pages to visit, so around 19 minutes without printing the JSON to the console. In your case network time plus print time may increase it.
#!/usr/bin/python
import requests
import re
import time

def find_votes():
    r = requests.get("https://www.govtrack.us/data/congress/113/votes/2013/")
    data = r.text
    votes = re.findall(r's\d+', data)
    return votes

def crawl_data(votes):
    print("Total pages: " + str(len(votes)))
    for x in votes:
        url = 'https://www.govtrack.us/data/congress/113/votes/2013/' + x + '/data.json'
        t1 = time.time()
        r = requests.get(url)
        json = r.json()
        print(time.time() - t1)

crawl_data(find_votes())
If you are using Python 3.x and crawling multiple sites, then for even better performance I warmly suggest the aiohttp module, which implements asynchronous principles.
For example:
import aiohttp
import asyncio

sites = ['url_1', 'url_2']
results = []

def save_response(result):
    site_content = result.result()
    results.append(site_content)

async def crawl_site(site):
    async with aiohttp.ClientSession() as session:
        async with session.get(site) as resp:
            resp = await resp.text()
            return resp

tasks = []
for site in sites:
    task = asyncio.ensure_future(crawl_site(site))
    task.add_done_callback(save_response)
    tasks.append(task)

all_tasks = asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(all_tasks)
loop.close()

print(results)
For more reading, see the aiohttp documentation.