I have a list of 80 usernames right now and I have my script check if each username exists or not. However it takes a little longer than I like so I was wondering if there is anything I can do to speed up how long it takes to check if each username exists or not.
# ------------------------------
# Mass Kik Username Checker
# Script Made by: Ski
# ------------------------------
import requests, threading
def check(username):
try:
req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
if req == 302:
return False
if req == 200:
return True
except Exception as e:
print e
exit()
def _loadList(filename):
item_list = []
for item in str(open(filename, "r").read()).split("\n"):
item_list.append(item)
return item_list
def _thread(items):
global _usernames
for username in _usernames[items[0]:items[1]]:
exists = check(username)
if exists:
print username+" exists\n"
if not exists:
print username+" doesn't exist\n"
if __name__ == '__main__':
_usernames = _loadList("usernames.txt")
thread1 = threading.Thread(target=_thread, args=([0, 20], )).start()
thread2 = threading.Thread(target=_thread, args=([20, 40], )).start()
thread3 = threading.Thread(target=_thread, args=([40, 60], )).start()
thread4 = threading.Thread(target=_thread, args=([60, 80], )).start()
Try out Python 3.x Pool of threads. You can define how many workers will perform the request. Using more (ex. 32) than 4, would speed-up your code dramatically.
import requests
from concurrent.futures import ThreadPoolExecutor
NUM_OF_WORKERS=32
def check(username):
try:
req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
if req == 302:
print(username, " does not exist.")
if req == 200:
print(username, "exists.")
except Exception as error:
print(error)
usernames = _loadList(filename)
with ThreadPoolExecutor(max_workers=NUM_OF_WORKERS) as pool:
pool.map(check, usernames)
This makes your code way more readable as well.
EDIT: noticed now the Python 2.7 tag.
Python 2 has a Pool of threads which is available under multiprocessing module. Unfortunately it's not documented as no tests were made available.
import requests
from multiprocessing.pool import ThreadPool
NUM_OF_WORKERS=32
def check(username):
try:
req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
if req == 302:
print(username, " does not exist.")
if req == 200:
print(username, "exists.")
except Exception as error:
print(error)
usernames = _loadList(filename)
pool = ThreadPool(processes=NUM_OF_WORKERS)
pool.map_async(check, usernames)
pool.close()
pool.join()
If you want a better Pool of Threads for Python 2, you can try the Pebble module
Related
When trying to send a function to the stream that parses the page and then executes the html.render, an error occurs:
Error: There is no current event loop in thread 'Thread-1 (take_proxy_us_spys_one_thread)
I started talking about a similar problem and realized that a friend here somehow managed to implement this. But I still get an error.
Here is my code which should be repeated all the time.
Help, please, to understand.
import urllib3
import requests
import time
from requests_html import HTMLSession
import threading
import fake_useragent
def take_proxy_us_spys_one(urls: list=[], header:dict = None,):
for url in urls:
try:
url_first = 'https://spys.one'
r = requests.get(url_first, headers=header)
cookies = r.cookies
session = HTMLSession()
r = session.post(url,
data={'xx00': '','xpp': '5','xf1': '0','xf2': '0','xf3': '0','xf4': '0', 'xf5': '0'},
headers=header,
cookies=cookies)
r.html.render(reload=False,)
print(str(r))
except Exception as exc:
print("Error: " + str(exc))
def take_proxy_us_spys_one_thread(event, sleeptime= 60, urls=[], lock = None):
while event.is_set():
try:
user = fake_useragent.UserAgent().random
header = {'User-Agent': user}
lock.acquire() if lock!=None else None
proxies_1 = take_proxy_us_spys_one(urls=urls, header=header)
lock.release() if lock != None else None
time.sleep(sleeptime)
except Exception as exc:
print("Error: " + str(exc))
time.sleep(sleeptime)
if __name__ == '__main__':
start_in_thread = True
urllib3.disable_warnings()
urls_spys_one = [
'https://spys.one/free-proxy-list/ALL/'
]
lock = threading.Lock()
event = threading.Event()
event.set()
t2 = threading.Thread(target=take_proxy_us_spys_one_thread, args=(event, 10, urls_spys_one, lock),).start()
I tried to implement the mechanism from here.
i need a script to make it like a cpanel checker, with more than 1 url and the url is stored in a txt file.
usage : python script.py list.txt
format in file list.txt : https://demo.cpanel.net:2083|democom|DemoCoA5620
this is my code but it doesn't work, can someone help me?
Thanks.
import requests, sys
from multiprocessing.dummy import Pool as ThreadPool
try:
with open(sys.argv[1], 'r') as f:
list_data = [line.strip() for line in f if line.strip()]
except IOError:
pass
def cpanel(url):
try:
data = {'user':'democom', 'pass':'DemoCoA5620'}
r = requests.post(url, data=data)
if r.status_code==200:
print "login success"
else:
print "login failed"
except:
pass
def chekers(url):
try:
cpanel(url)
except:
pass
def Main():
try:
start = timer()
pp = ThreadPool(25)
pr = pp.map(chekers, list_data)
print('Time: ' + str(timer() - start) + ' seconds')
except:
pass
if __name__ == '__main__':
Main()
I fixed your code in a way that it will return an actual array containing a boolean array indicating the success of the cpanel function.
from __future__ import print_function
import requests
from multiprocessing.pool import ThreadPool
try:
list_data = ["https://demo.cpanel.net:2083|democom|DemoCoA5620",
"https://demo.cpanel.net:2083|UserDoesNotExist|WRONGPASSWORD",
]
except IOError:
pass
def cpanel(url):
try:
# try to split that url to get username / password
try:
url, username, password = url.split('|')
except Exception as e:
print("Url {} seems to have wrong format. Concrete error: {}".format(url, e))
return False
# build the correct url
url += '/login/?login_only=1'
# build post parameters
params = {'user': username,
'pass': password}
# make request
r = requests.post(url, params)
if r.status_code==200:
print("login for user {} success".format(username))
return True
else:
print("login for user {} failed due to Status Code {} and message \"{}\"".format(username, r.status_code, r.reason))
return False
except Exception as e:
print("Error occured for url {} ".format(e))
return False
def chekers(url):
return cpanel(url)
def Main():
try:
# start = timer()
pp = ThreadPool(1)
pr = pp.map(chekers, list_data)
print(pr)
# print('Time: ' + str(timer() - start) + ' seconds')
except:
pass
if __name__ == '__main__':
Main()
Output:
login for user democom success
login for user UserDoesNotExist failed due to Status Code 401 and message "Access Denied"
[True, False]
Be aware that I replaced your file read operation by some fixed urls.
Since you use request.post I guess you actually want to POST something to that urls. Your code does not do that. If you just want to send a request, use the requests.get method.
See the official documentation for the requests packet: https://2.python-requests.org/en/master/user/quickstart/#make-a-request for more details.
Also note that
"but it doesn't work"
is NOT a question.
I'm trying to get some data from a web page. To speed up this process (they allow me to make 1000 requests per minute), I use ThreadPool.
Since there is a huge amount of data, the process is quite vulnerable to connection fails etc. so I try to log everything I can to be able to detect each mistake I did in code.
The problem is that program sometimes just stops without any exception (it acts like it is running but with no effect - I use PyCharm). I log catched exceptions everywhere I can but I can't see any exception in any log.
I assume that if there were a timeout reached, the exception would be raised and logged.
I've found out where the problem could be. Here is the code:
As a pool, I use: from multiprocessing.pool import ThreadPool as Pool
And lock: from threading import Lock
The download_category function is being used in loop.
def download_category(url):
# some code
#
# ...
log('Create pool...')
_pool = Pool(_workers_number)
with open('database/temp_produkty.txt') as f:
log('Spracovavanie produktov... vytvaranie vlakien...') # I see this in log
for url_product in f:
x = _pool.apply_async(process_product, args=(url_product.strip('\n'), url))
_pool.close()
_pool.join()
log('Presuvanie produktov z temp export do export.csv...') # I can't see this in log
temp_export_to_export_csv()
set_spracovanie_kategorie(url)
except Exception as e:
logging.exception('Got exception on download_one_category: {}'.format(url))
And process_product function:
def process_product(url, cat):
try:
data = get_product_data(url)
except:
log('{}: {} exception while getting product data... #') # I don't see this in log
return
try:
print_to_temp_export(data, cat) # I don't see this in log
except:
log('{}: {} exception while printing to csv... #') # I don't see this in log
raise
LOG function:
def log(text):
now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
_lock.acquire()
mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))
_lock.release()
I use logging module too. In this log, I see that probably 8 (number of workers) times request was sent but no answer hasn't been recieved.
EDIT1:
def get_product_data(url):
data = defaultdict(lambda: '-')
root = load_root(url)
try:
nazov = root.xpath('//h1[#itemprop="name"]/text()')[0]
except:
nazov = root.xpath('//h1/text()')[0]
under_block = root.xpath('//h2[#id="lowest-cost"]')
if len(under_block) < 1:
under_block = root.xpath('//h2[contains(text(),"Naj")]')
if len(under_block) < 1:
return False
data['nazov'] = nazov
data['url'] = url
blocks = under_block[0].xpath('./following-sibling::div[#class="shp"]/div[contains(#class,"shp")]')
i = 0
for block in blocks:
i += 1
data['dat{}_men'.format(i)] = eblock.xpath('.//a[#class="link"]/text()')[0]
del root
return data
LOAD ROOT:
class RedirectException(Exception):
pass
def load_url(url):
r = requests.get(url, allow_redirects=False)
if r.status_code == 301:
raise RedirectException
if r.status_code == 404:
if '-q-' in url:
url = url.replace('-q-','-')
mLib.printToFileWOEncoding('logging/neexistujuce.txt','Skusanie {} kategorie...'.format(url))
return load_url(url) # THIS IS NOT LOOPING
else:
mLib.printToFileWOEncoding('logging/neexistujuce.txt','{}'.format(url))
html = r.text
return html
def load_root(url):
try:
html = load_url(url)
except Exception as e:
logging.exception('load_root_exception')
raise
return etree.fromstring(html, etree.HTMLParser())
This my code which basically just takes a list of 94,000+ URLs, and collects the http_status codes for them:
#!/usr/bin/python3
import threading
from queue import Queue
import urllib.request
import urllib.parse
from http.client import HTTPConnection
import socket
import http.client
#import httplib
url_input = open("urls_prod_sort.txt", "r").read()
urls = url_input[:url_input.rfind('\n')].split('\n')
#urls = urls[:100]
url_502 = []
url_logs = []
url_502_lock = threading.Lock()
print_lock = threading.Lock()
def sendRequest(url_u, http_method = 'GET', data = None):
use_proxy = "http://xxxxxxxx:8080"
proxies = {"http": use_proxy}
proxy = urllib.request.ProxyHandler(proxies)
handler = urllib.request.HTTPHandler()
url = "http://" + url_u
with print_lock:
print(url)
opener = urllib.request.build_opener(proxy,handler)
urllib.request.install_opener(opener)
request = urllib.request.Request(url,data)
request.add_header("User-agent","| MSIE |")
request.get_method = lambda: http_method
try:
response = urllib.request.urlopen(request)
response_code = response.code
except urllib.error.HTTPError as error:
response_code = error.code
except urllib.error.URLError as e2:
response_code = 701
except socket.timeout as e3:
response_code = 702
except socket.error as e4:
response_code = 703
except http.client.IncompleteRead as e:
response_code = 700
if response_code == 502:
with url_502_lock:
#url_502.append(url)
url_502_file = open("url_502_file.txt", "a")
url_502_file.write(url + "\n")
url_502_file.close()
with print_lock:
#url_logs.append(url + "," + str(response_code))
url_all_logs_file = open("url_all_logs.csv", "a")
url_all_logs_file.write(url + "," + str(response_code) + '\n')
url_all_logs_file.close()
#print (url + "," + str(response_code))
#print (response_code)
return response_code
def worker():
while True:
url = q.get()
if url == ":::::"
break
else:
sendRequest(url)
q.task_done()
#======================================
q = Queue()
for threads in range(1000):
t = threading.Thread(target = worker)
t.daemon = True
t.start()
for url in urls:
q.put(url)
q.put(":::::")
q.join()
However, the program never seems to terminate (even tho the URLs have all been iteratred through) which forces me to ctrl-c the program - and then I get the following error:
Traceback (most recent call last):
File "./url_sc_checker.py", line 120, in <module>
q.join()
File "/usr/lib/python3.2/queue.py", line 82, in join
self.all_tasks_done.wait()
File "/usr/lib/python3.2/threading.py", line 235, in wait
waiter.acquire()
KeyboardInterrupt
The reason that your program doesn't terminate is simple, your worker creates an infinite loop:
def worker():
while True:
...
You need to either throw an exception, break, or have a terminating condition in your while statement. Otherwise your program would remain trying to get the next job from the queue, without knowing that there will never be the next job.
A common way to do this is to put a sentinel value in your queue, when checking out a job from the queue, the worker checks if it is the sentinel value and breaks out the loop.
Another way is to have a global condition variable that you check in the while condition. When the job producer have pushed all items to the queue, the job producer joins the queue, and when all jobs are done, the job producer unblocks and terminates the threads our processes.
Another possible reason why your process doesn't terminate is if your sendRequest produces an unexpected exception, then the thread terminates and you'll be left with some jobs that are never marked as done.
I've read many answers, however I have not found a proper solution.
The problem, I'm reading mixed/replace HTTP streams that will not expire or end by default.
You can try it by yourself using curl:
curl http://agent.mtconnect.org/sample\?interval\=0
So, now I'm using Python threads and requests to read data from multiple streams.
import requests
import uuid
from threading import Thread
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
thread_id = []
def http_handler(thread_id, url, flag):
print 'Starting task %s' % thread_id
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
print line
if flag and line.endswith('</MTConnectStreams>'):
# Wait until XML message end is reached to receive the full message
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
for task in tasks:
uid = str(uuid.uuid4())
thread_id.append(uid)
t = Thread(target=http_handler, args=(uid, task, False), name=uid)
t.start()
print thread_id
# Wait Time X or until user is doing something
# Send flag = to desired thread to indicate the loop should stop after reaching the end.
Any suggestions? What is the best solution? I don't want to kill the thread because I would like to read the ending to have a full XML message.
I found a solution by using threading module and threading.events. Maybe not the best solution, but it works fine currently.
import logging
import threading
import time
import uuid
import requests
logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
d = dict()
def http_handler(e, url):
logging.debug('wait_for_event starting')
message_buffer = []
filter_namespace = True
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
message_buffer.append(line)
if e.isSet() and line.endswith('</MTConnectStreams>'):
logging.debug(len(message_buffer))
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
logging.debug('Waiting before calling Event.set()')
for task in tasks:
uid = str(uuid.uuid4())
e = threading.Event()
d[uid] = {"stop_event": e}
t = threading.Event(uid)
t = threading.Thread(name=uid,
target=http_handler,
args=(e, task))
t.start()
logging.debug('Waiting 3 seconds before calling Event.set()')
for key in d:
time.sleep(3)
logging.debug(threading.enumerate())
logging.debug(d[key])
d[key]['stop_event'].set()
logging.debug('bye')