When I try to run, in a thread, a function that parses the page and then calls html.render, an error occurs:
Error: There is no current event loop in thread 'Thread-1 (take_proxy_us_spys_one_thread)'
I came across a similar question and saw that someone here managed to get this working, but I still get the error.
Here is my code, which is supposed to run repeatedly.
Please help me understand what is going wrong.
import urllib3
import requests
import time
from requests_html import HTMLSession
import threading
import fake_useragent
def take_proxy_us_spys_one(urls: list = [], header: dict = None):
    for url in urls:
        try:
            url_first = 'https://spys.one'
            r = requests.get(url_first, headers=header)
            cookies = r.cookies
            session = HTMLSession()
            r = session.post(url,
                             data={'xx00': '', 'xpp': '5', 'xf1': '0', 'xf2': '0', 'xf3': '0', 'xf4': '0', 'xf5': '0'},
                             headers=header,
                             cookies=cookies)
            r.html.render(reload=False)
            print(str(r))
        except Exception as exc:
            print("Error: " + str(exc))

def take_proxy_us_spys_one_thread(event, sleeptime=60, urls=[], lock=None):
    while event.is_set():
        try:
            user = fake_useragent.UserAgent().random
            header = {'User-Agent': user}
            if lock is not None:
                lock.acquire()
            proxies_1 = take_proxy_us_spys_one(urls=urls, header=header)
            if lock is not None:
                lock.release()
            time.sleep(sleeptime)
        except Exception as exc:
            print("Error: " + str(exc))
            time.sleep(sleeptime)

if __name__ == '__main__':
    start_in_thread = True
    urllib3.disable_warnings()
    urls_spys_one = [
        'https://spys.one/free-proxy-list/ALL/'
    ]
    lock = threading.Lock()
    event = threading.Event()
    event.set()
    t2 = threading.Thread(target=take_proxy_us_spys_one_thread, args=(event, 10, urls_spys_one, lock)).start()
I tried to implement the mechanism from here.
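For what it's worth, this error usually means that render() is looking for an asyncio event loop, and threads other than the main thread do not get one automatically. A minimal, untested sketch of the common workaround is to create a loop inside the worker thread before using HTMLSession:

import asyncio

def take_proxy_us_spys_one_thread(event, sleeptime=60, urls=[], lock=None):
    # Give this worker thread its own event loop so requests_html/pyppeteer
    # can find one when render() is called (assumption: nothing else in this
    # thread has already created a loop).
    asyncio.set_event_loop(asyncio.new_event_loop())
    while event.is_set():
        ...  # rest of the loop unchanged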
I need to parse HTML from a list of domains (only the main pages).
The script works well for a while, then it starts returning no data very quickly; it looks like the requests are not even being sent.
My code:
import asyncio
import time
import aiohttp
import pandas as pd
import json
from bs4 import BeautifulSoup

df = pd.read_excel('work_file.xlsx')
domains_count = df.shape[0]
start_time = time.time()
print(start_time)
data = {}

async def get_data(session, url, j):
    try:
        async with session.get(url) as resp:
            html = await resp.text()
            rawhtml = BeautifulSoup(html, 'lxml')
            title = rawhtml.title
            data[url] = {'url': url, 'resp': resp.status, 'title': str(title), 'html': str(rawhtml)}
            print(j)
            print(url)
    except Exception as e:
        data[url] = {'url': url, 'resp': str(e), 'title': 'None', 'html': 'None'}
        print(j)
        print(url)
        print(str(e))

async def get_queue():
    tasks = []
    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=120)) as session:
        for j, i in enumerate(df.domain):
            i = 'http://' + i.lower()
            task = asyncio.create_task(get_data(session, i, j))
            tasks.append(task)
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(get_queue())
    with open('parsed_data.json', 'a+') as file:
        file.write(json.dumps(data))
    end_time = time.time() - start_time
    print(end_time)
That was a timeout error.
'resp': str(e)
This code stores only the exception's message. A TimeoutError has no message, so str(e) is an empty string.
Using repr(e) lets you see the actual error.
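A quick way to see the difference (a minimal demo using asyncio.TimeoutError, which is what an aiohttp timeout typically raises):

import asyncio

try:
    raise asyncio.TimeoutError()
except Exception as e:
    print('str: ', str(e))    # empty: the exception carries no message
    print('repr:', repr(e))   # prints TimeoutError(), which at least names the error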
I understand that this is a duplicate, but I haven't had that "ah-ha" moment where I understand HOW to access a class's variable. In this code, I am crawling a website from a list of thousands of pages. Those jobs are submitted via concurrent.futures.
I want to be able to return the value of "results". I've set self.results within def __init__(self, url_list, threads), and I can't seem to pull that variable when I try print(example.results).
If self.results is returning a value, but example.results isn't pulling it from if __name__ == '__main__':, how can I access that? I know I've done something wrong, but I don't know what it is.
from concurrent.futures import ThreadPoolExecutor
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

class ConcurrentListCrawler(object):

    def __init__(self, url_list, threads):
        self.urls = url_list
        self.results = {}
        self.max_threads = threads

    def __make_request(self, url):
        try:
            r = requests.get(url=url, timeout=20)
            r.raise_for_status()
            print(countit(), r.url)
        except requests.exceptions.Timeout:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.ConnectionError:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.RequestException as e:
            raise e
        return r.url, r.text

    def __parse_results(self, url, html):
        try:
            print(url)
            trip_data = restaurant_parse(url)
        except Exception as e:
            raise e
        if trip_data:
            print('here we go')
            self.results = trip_data
            #print(self.results)
        return self.results

    def wrapper(self, url):
        url, html = self.__make_request(url)
        self.__parse_results(url, html)

    def run_script(self):
        with ThreadPoolExecutor(max_workers=min(len(self.urls), self.max_threads)) as Executor:
            jobs = [Executor.submit(self.wrapper, u) for u in self.urls]

if __name__ == '__main__':
    listo = loadit()
    print(listo)
    print(len(listo))
    example = ConcurrentListCrawler(listo, 10)
    example.run_script()
    print(example.results)
Any pointers would be greatly appreciated.
I believe one of your methods is not returning the results.
Make the following change.
def wrapper(self, url):
    url, html = self.__make_request(url)
    return self.__parse_results(url, html)
After this, I suggest you use self.results as a dictionary, as it was declared.
In __parse_results(...), add trip_data to self.results under its URL, as follows, instead of reassigning the whole attribute.
def __parse_results(self, url, html):
    try:
        print(url)
        trip_data = restaurant_parse(url)
    except Exception as e:
        raise e
    if trip_data:
        print('here we go')
        self.results[url] = trip_data
        #print(self.results)
    return self.results
When you add entries to self.results this way, it retains the older values, and you avoid wiping them out by reassignment.
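With both changes in place, the usage from the question should then behave as expected, roughly like this sketch (the with block inside run_script waits for all submitted jobs before returning, so example.results is complete afterwards):

example = ConcurrentListCrawler(listo, 10)
example.run_script()                       # blocks until all threads finish
for url, trip_data in example.results.items():
    print(url, trip_data)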
The issue was that I submitted all the jobs at once through a list. I was unable to pull the variable from the class with print(example.results) because that part of the code isn't reached until all jobs are complete. I was able to resolve this by getting rid of the class (even though the title of this posting suggests the class was the issue).
from concurrent.futures import ThreadPoolExecutor
import concurrent
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

def load_url(url):
    try:
        print(countit(), url)
        trip_data = restaurant_parse(url)
        return trip_data
    except Exception as e:
        raise e

if __name__ == '__main__':
    URLs = loadit()
    #print(URLs)
    #print(len(URLs))
    with ThreadPoolExecutor(max_workers=10) as executor:
        # start the load operations and mark each future with its URL
        future_to_url = {executor.submit(load_url, url): url for url in URLs}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                print('this is data', data)
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
Here, I can pull the parsed dictionary by grabbing data from each completed future.
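If the per-URL results need to be kept rather than just printed, a small variation of the same pattern (a sketch; the results dict and the None fallback are my additions) collects them into a dictionary keyed by URL:

results = {}
with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_url = {executor.submit(load_url, url): url for url in URLs}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            results[url] = future.result()
        except Exception as exc:
            results[url] = None   # keep a placeholder for failed URLs
            print('%r generated an exception: %s' % (url, exc))
print(len(results), 'URLs processed')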
Thanks for the help, everyone.
This is my code; it basically just takes a list of 94,000+ URLs and collects the HTTP status codes for them:
#!/usr/bin/python3
import threading
from queue import Queue
import urllib.request
import urllib.parse
from http.client import HTTPConnection
import socket
import http.client
#import httplib

url_input = open("urls_prod_sort.txt", "r").read()
urls = url_input[:url_input.rfind('\n')].split('\n')
#urls = urls[:100]

url_502 = []
url_logs = []
url_502_lock = threading.Lock()
print_lock = threading.Lock()

def sendRequest(url_u, http_method='GET', data=None):
    use_proxy = "http://xxxxxxxx:8080"
    proxies = {"http": use_proxy}
    proxy = urllib.request.ProxyHandler(proxies)
    handler = urllib.request.HTTPHandler()
    url = "http://" + url_u
    with print_lock:
        print(url)
    opener = urllib.request.build_opener(proxy, handler)
    urllib.request.install_opener(opener)
    request = urllib.request.Request(url, data)
    request.add_header("User-agent", "| MSIE |")
    request.get_method = lambda: http_method
    try:
        response = urllib.request.urlopen(request)
        response_code = response.code
    except urllib.error.HTTPError as error:
        response_code = error.code
    except urllib.error.URLError as e2:
        response_code = 701
    except socket.timeout as e3:
        response_code = 702
    except socket.error as e4:
        response_code = 703
    except http.client.IncompleteRead as e:
        response_code = 700
    if response_code == 502:
        with url_502_lock:
            #url_502.append(url)
            url_502_file = open("url_502_file.txt", "a")
            url_502_file.write(url + "\n")
            url_502_file.close()
    with print_lock:
        #url_logs.append(url + "," + str(response_code))
        url_all_logs_file = open("url_all_logs.csv", "a")
        url_all_logs_file.write(url + "," + str(response_code) + '\n')
        url_all_logs_file.close()
        #print (url + "," + str(response_code))
    #print (response_code)
    return response_code

def worker():
    while True:
        url = q.get()
        if url == ":::::":
            break
        else:
            sendRequest(url)
            q.task_done()

#======================================
q = Queue()

for threads in range(1000):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

for url in urls:
    q.put(url)
q.put(":::::")

q.join()
However, the program never seems to terminate (even though all the URLs have been iterated through), which forces me to Ctrl-C the program, and then I get the following error:
Traceback (most recent call last):
File "./url_sc_checker.py", line 120, in <module>
q.join()
File "/usr/lib/python3.2/queue.py", line 82, in join
self.all_tasks_done.wait()
File "/usr/lib/python3.2/threading.py", line 235, in wait
waiter.acquire()
KeyboardInterrupt
The reason your program doesn't terminate is simple: your worker creates an infinite loop:
def worker():
    while True:
        ...
You need to either raise an exception, break, or have a terminating condition in your while statement. Otherwise the worker will keep trying to get the next job from the queue, without knowing that there will never be a next job.
A common way to do this is to put a sentinel value in your queue: when a worker takes a job from the queue, it checks whether it is the sentinel value and, if so, breaks out of the loop.
Another way is to have a global condition variable that you check in the while condition. When the job producer has pushed all items to the queue, it joins the queue; when all jobs are done, it unblocks and terminates the threads or processes.
Another possible reason your process doesn't terminate is that if sendRequest raises an unexpected exception, the thread dies and you are left with jobs that are never marked as done.
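For example, here is a sketch of those two points applied to the code above (assumptions on my part: one sentinel per worker thread, the sentinel is also marked as done so q.join() can return, and task_done() is called even when sendRequest() fails):

SENTINEL = ":::::"
NUM_WORKERS = 1000

def worker():
    while True:
        url = q.get()
        if url == SENTINEL:
            q.task_done()   # the sentinel counts as a queue item too
            break
        try:
            sendRequest(url)
        except Exception as exc:
            print("unexpected error for %s: %s" % (url, exc))
        finally:
            q.task_done()   # always mark the job done so q.join() can finish

for url in urls:
    q.put(url)
for _ in range(NUM_WORKERS):
    q.put(SENTINEL)         # one sentinel per worker so every thread exits
q.join()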
I have a list of 80 usernames right now, and my script checks whether each username exists. However, it takes a little longer than I'd like, so I was wondering whether there is anything I can do to speed up the check.
# ------------------------------
# Mass Kik Username Checker
# Script Made by: Ski
# ------------------------------

import requests, threading

def check(username):
    try:
        req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
        if req == 302:
            return False
        if req == 200:
            return True
    except Exception as e:
        print e
        exit()

def _loadList(filename):
    item_list = []
    for item in str(open(filename, "r").read()).split("\n"):
        item_list.append(item)
    return item_list

def _thread(items):
    global _usernames
    for username in _usernames[items[0]:items[1]]:
        exists = check(username)
        if exists:
            print username+" exists\n"
        if not exists:
            print username+" doesn't exist\n"

if __name__ == '__main__':
    _usernames = _loadList("usernames.txt")
    thread1 = threading.Thread(target=_thread, args=([0, 20], )).start()
    thread2 = threading.Thread(target=_thread, args=([20, 40], )).start()
    thread3 = threading.Thread(target=_thread, args=([40, 60], )).start()
    thread4 = threading.Thread(target=_thread, args=([60, 80], )).start()
Try Python 3.x's pool of threads. You can define how many workers will perform the requests; using more than 4 (e.g., 32) should speed up your code dramatically.
import requests
from concurrent.futures import ThreadPoolExecutor

NUM_OF_WORKERS = 32

def check(username):
    try:
        req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
        if req == 302:
            print(username, "does not exist.")
        if req == 200:
            print(username, "exists.")
    except Exception as error:
        print(error)

usernames = _loadList(filename)

with ThreadPoolExecutor(max_workers=NUM_OF_WORKERS) as pool:
    pool.map(check, usernames)
This makes your code way more readable as well.
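Since pool.map yields results in the same order as its input, check could also return a value instead of printing, and the caller could pair each username with its result. A sketch, assuming check is changed back to return True/False as in the original script:

with ThreadPoolExecutor(max_workers=NUM_OF_WORKERS) as pool:
    # zip the inputs with the results, which arrive in input order
    for username, exists in zip(usernames, pool.map(check, usernames)):
        print(username, "exists" if exists else "does not exist")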
EDIT: I only now noticed the Python 2.7 tag.
Python 2 has a pool of threads available in the multiprocessing module. Unfortunately, it is not documented, as no tests were made available for it.
import requests
from multiprocessing.pool import ThreadPool

NUM_OF_WORKERS = 32

def check(username):
    try:
        req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
        if req == 302:
            print(username, "does not exist.")
        if req == 200:
            print(username, "exists.")
    except Exception as error:
        print(error)

usernames = _loadList(filename)

pool = ThreadPool(processes=NUM_OF_WORKERS)
pool.map_async(check, usernames)
pool.close()
pool.join()
If you want a better pool of threads for Python 2, you can try the Pebble module.
I've read many answers; however, I have not found a proper solution.
The problem: I'm reading mixed/replace HTTP streams that do not expire or end by default.
You can try it yourself using curl:
curl http://agent.mtconnect.org/sample\?interval\=0
So, now I'm using Python threads and requests to read data from multiple streams.
import requests
import uuid
from threading import Thread

tasks = ['http://agent.mtconnect.org/sample?interval=5000',
         'http://agent.mtconnect.org/sample?interval=10000']

thread_id = []

def http_handler(thread_id, url, flag):
    print 'Starting task %s' % thread_id
    try:
        requests_stream = requests.get(url, stream=True, timeout=2)
        for line in requests_stream.iter_lines():
            if line:
                print line
                if flag and line.endswith('</MTConnectStreams>'):
                    # Wait until XML message end is reached to receive the full message
                    break
    except requests.exceptions.RequestException as e:
        print('error: ', e)
    except BaseException as e:
        print e

if __name__ == '__main__':
    for task in tasks:
        uid = str(uuid.uuid4())
        thread_id.append(uid)
        t = Thread(target=http_handler, args=(uid, task, False), name=uid)
        t.start()
    print thread_id
    # Wait Time X or until user is doing something
    # Send flag = to desired thread to indicate the loop should stop after reaching the end.
Any suggestions? What is the best solution? I don't want to kill the thread, because I would like to read to the end and get a full XML message.
I found a solution using the threading module and threading.Event. It may not be the best solution, but it works fine for now.
import logging
import threading
import time
import uuid

import requests

logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )

tasks = ['http://agent.mtconnect.org/sample?interval=5000',
         'http://agent.mtconnect.org/sample?interval=10000']

d = dict()

def http_handler(e, url):
    logging.debug('wait_for_event starting')
    message_buffer = []
    filter_namespace = True
    try:
        requests_stream = requests.get(url, stream=True, timeout=2)
        for line in requests_stream.iter_lines():
            if line:
                message_buffer.append(line)
                if e.isSet() and line.endswith('</MTConnectStreams>'):
                    logging.debug(len(message_buffer))
                    break
    except requests.exceptions.RequestException as e:
        print('error: ', e)
    except BaseException as e:
        print e

if __name__ == '__main__':
    logging.debug('Waiting before calling Event.set()')
    for task in tasks:
        uid = str(uuid.uuid4())
        e = threading.Event()
        d[uid] = {"stop_event": e}
        t = threading.Thread(name=uid,
                             target=http_handler,
                             args=(e, task))
        t.start()

    logging.debug('Waiting 3 seconds before calling Event.set()')
    for key in d:
        time.sleep(3)
        logging.debug(threading.enumerate())
        logging.debug(d[key])
        d[key]['stop_event'].set()
    logging.debug('bye')