Urllib giving fake proxy - python

I test my proxies with this script and it gives many working but when i test the "working" proxies with an other proxy checker only a very little amount works.
Here is the part that's checking if the proxy works:
def process(self, task):
global alive
global dead
global tested
proxy = task
log_msg = str("Trying HTTP proxy%21s " % proxy)
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
urllib.request.HTTPCookieProcessor(cj),
urllib.request.HTTPRedirectHandler(),
urllib.request.ProxyHandler({'http': proxy})
)
try:
t1 = time.time()
response = opener.open(test_url, timeout=timeout_value).read()
tested += 1
t2 = time.time()
except Exception as e:
log_msg += "%s " % fail_msg
print(Fore.LIGHTRED_EX + log_msg)
dead += 1
tested += 1
return None
log_msg += ok_msg + "Response time: %d" % (int((t2-t1)*1000))
print(Fore.LIGHTGREEN_EX + log_msg)
text_file = open(out_filename, "a")
text_file.write(proxy + "\r\n")
text_file.close()
alive += 1

Related

Correct way to handle exception on Python2.6

I had an Python script that running continuously. If there any new file on directory then the python will open url using urllib2 to do some request on specific ip.
here are the code
encoded_string = base64.b64encode(image_file.read())
values = dumps({
'image_data':encoded_string,
'requestCode':'111'
})
headers = {"Content-Type": "application/json"}
request = Request("http:/xx.xxx.xx.xxx/api/carplate_service",data=values, headers=headers)
response = urlopen(request, timeout=60)
The code are working well but on random time, let say usually happened on 1-2 AM then I got this error:
<class 'urllib2.URLError'> - <urlopen error [Errno 110] Connection timed out>
I had an exception on that function on this bellow :
try:
ip = sys.argv[1]
histId = int(sys.argv[2])
handler = ModHandler()
wm = pyinotify.WatchManager()
notifier = pyinotify.Notifier(wm, handler)
wdd = wm.add_watch('./' + ip + '/', pyinotify.IN_CLOSE_WRITE)
notifier.loop()
except BaseException as e:
with open("error.log", "a") as text_file:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
text_file.write( time.strftime("%Y-%m-%d %H:%M:%S") + " [" + str(exc_tb.tb_lineno) + " - " + fname + "] : " + str(exc_type) + " - " + str(e) + "\n")
text_file.close();
The exception not working well because application cannot continue if there are some error like above.
My question are how to make program still continue even the exception throw?
I'm using python2.6
Thanks
For function calls that go out to external services I usually find that the following basic structure works pretty well
import time
expire_time = 2
while True:
start_time = time.time()
try:
# Do something here
# ....
# If you make it to the bottom of the code, break out of the loop
break
except BaseException as e:
# Compare the start_time with the current time
now_time = time.time()
if now_time > start_time + expire_time:
raise e
# Otherwise try executing the `try` block again
Using the code that you've provided it could look something like this
import time
expire_time = 2
while True:
start_time = time.time()
try:
ip = sys.argv[1]
histId = int(sys.argv[2])
handler = ModHandler()
wm = pyinotify.WatchManager()
notifier = pyinotify.Notifier(wm, handler)
wdd = wm.add_watch('./' + ip + '/', pyinotify.IN_CLOSE_WRITE)
notifier.loop()
break
except BaseException as e:
now_time = time.time()
if now_time > start_time + expire_time:
raise e
else:
with open("error.log", "a") as text_file:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
text_file.write( time.strftime("%Y-%m-%d %H:%M:%S") + " [" + str(exc_tb.tb_lineno) + " - " + fname + "] : " + str(exc_type) + " - " + str(e) + "\n")
text_file.close();

Python - Multithreaded Proxy Tester

I'm building a proxy checker using multithreads, specificly a thread pool from:
from multiprocessing.dummy import Pool as ThreadPool.
The http request is by using urllib2.
What I want to do is for each proxy run 20 requests. If it was 1 threaded it would take too much time. thats where the multithreads power comes to help. So once I set up the proxy I want to run those 20 requests, and manage 2 things. One is to count the exceptions and dump the proxy if too many occurs. 2nd Is to save the average response time and present it later.
I just don't manage to implement the above. But I have implemented it with 1 thread:
import socket
import ssl
import time
import urllib
import urllib2
import httplib
proxyList = []
def loadProxysFromFile(fileName):
global proxyList
with open(fileName) as f:
proxyList = [line.rstrip('\n') for line in f]
def setUrllib2Proxy(proxyAddress):
proxy = urllib2.ProxyHandler({
'http': "http://" + proxyAddress,
'https': "https://" + proxyAddress
})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
def timingRequest(proxy, url):
error = False
setUrllib2Proxy(proxy)
start = time.time()
try:
req = urllib2.Request(url)
urllib2.urlopen(req, timeout=5) #opening the request (getting a response)
except (urllib2.URLError, httplib.BadStatusLine, ssl.SSLError, socket.error) as e:
error = True
end = time.time()
timing = end - start
if error:
print "Error with proxy " + proxy
return 0
else:
print proxy + " Request to " + url + " took: %s" %timing + " seconds."
return timing
# Main
loadProxysFromFile("proxyList.txt")
for proxy in proxyList:
print "Testing: " + proxy
print "\n"
REQUEST_NUM = 20
ERROR_TOLERANCE_NUM = 3
resultList = []
for proxy in proxyList:
avgTime = 0
errorCount = 0
for x in range(0, REQUEST_NUM):
result = timingRequest(proxy, 'https://www.google.com')
if (result == 0):
errorCount += 1
if (errorCount >= ERROR_TOLERANCE_NUM):
break
else:
avgTime += result
if (errorCount < ERROR_TOLERANCE_NUM):
avgTime = avgTime/(REQUEST_NUM-errorCount)
resultList.append(proxy + " has an average response time of: %s" %avgTime)
print '\n'
print "Results Summery: "
print "-----------------"
for res in resultList:
print res
Things that must be done are:
for every proxy: wait until all 20 requests are over before changing proxy. Sync somehow the threads when they adding up to calculate the average response time (includes not to take in account the exceptions)
The best solutions I've read so far is using from multiprocessing.dummy import Pool as ThreadPool and pool.map(func, iterable) but I cant figure out how to implement it in my code.

trying to split the file download buffer to into separate threads

I am trying to download the buffer of file into 5 threads but it seems like it's getting garbled.
from numpy import arange
import requests
from threading import Thread
import urllib2
url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = r = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers['content-length']
splitBy = 5
splits = arange(splitBy + 1) * (float(sizeInBytes)/splitBy)
dataLst = []
def bufferSplit(url, idx, splits):
req = urllib2.Request(url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
print {'bytes=%d-%d' % (splits[idx], splits[idx+1])}
dataLst.append(urllib2.urlopen(req).read())
for idx in range(splitBy):
dlth = Thread(target=bufferSplit, args=(url, idx, splits))
dlth.start()
print dataLst
with open('page.html', 'w') as fh:
fh.write(''.join(dataLst))
Update:
So I worked over and got little but progress, however if I download a jpg it seems to be corrupted;
from numpy import arange
import os
import requests
import threading
import urllib2
# url ='http://s1.fans.ge/mp3/201109/08/John_Legend_So_High_Remix(fans_ge).mp3'
url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
# url = 'http://pymotw.com/2/urllib/index.html'
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
splitBy = 5
dataLst = []
class ThreadedFetch(threading.Thread):
""" docstring for ThreadedFetch
"""
def __init__(self, url, fileName, splitBy=5):
super(ThreadedFetch, self).__init__()
self.__url = url
self.__spl = splitBy
self.__dataLst = []
self.__fileName = fileName
def run(self):
if not sizeInBytes:
print "Size cannot be determined."
return
splits = arange(self.__spl + 1) * (float(sizeInBytes)/self.__spl)
for idx in range(self.__spl):
req = urllib2.Request(self.__url, headers={'Range': 'bytes=%d-%d' % (splits[idx], splits[idx+1])})
self.__dataLst.append(urllib2.urlopen(req).read())
def getFileData(self):
return ''.join(self.__dataLst)
fileName = url.split('/')[-1]
dl = ThreadedFetch(url, fileName)
dl.start()
dl.join()
content = dl.getFileData()
if content:
with open(fileName, 'w') as fh:
fh.write(content)
print "Finished Writing file %s" % fileName
Below is how the image after getting downloaded.
Here's another version of the project. Differences:
thread code is a single small function
each thread downloads a chunk, then stores it in a global threadsafe dictionary
threads are started, then join()ed -- they're all running at once
when all done, data is reassembled in correct order then written to disk
extra printing, to verify everything's correct
output file size is calculated, for an extra comparison
source
import os, requests
import threading
import urllib2
import time
URL = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
def buildRange(value, numsplits):
lst = []
for i in range(numsplits):
if i == 0:
lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
else:
lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
return lst
def main(url=None, splitBy=3):
start_time = time.time()
if not url:
print "Please Enter some url to begin download."
return
fileName = url.split('/')[-1]
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
print "%s bytes to download." % sizeInBytes
if not sizeInBytes:
print "Size cannot be determined."
return
dataDict = {}
# split total num bytes into ranges
ranges = buildRange(int(sizeInBytes), splitBy)
def downloadChunk(idx, irange):
req = urllib2.Request(url)
req.headers['Range'] = 'bytes={}'.format(irange)
dataDict[idx] = urllib2.urlopen(req).read()
# create one downloading thread per chunk
downloaders = [
threading.Thread(
target=downloadChunk,
args=(idx, irange),
)
for idx,irange in enumerate(ranges)
]
# start threads, let run in parallel, wait for all to finish
for th in downloaders:
th.start()
for th in downloaders:
th.join()
print 'done: got {} chunks, total {} bytes'.format(
len(dataDict), sum( (
len(chunk) for chunk in dataDict.values()
) )
)
print "--- %s seconds ---" % str(time.time() - start_time)
if os.path.exists(fileName):
os.remove(fileName)
# reassemble file in correct order
with open(fileName, 'w') as fh:
for _idx,chunk in sorted(dataDict.iteritems()):
fh.write(chunk)
print "Finished Writing file %s" % fileName
print 'file size {} bytes'.format(os.path.getsize(fileName))
if __name__ == '__main__':
main(URL)
output
102331 bytes to download.
done: got 3 chunks, total 102331 bytes
--- 0.380599021912 seconds ---
Finished Writing file 607800main_kepler1200_1600-1200.jpg
file size 102331 bytes
Here is how I got it working if anyone got any suggestion for possible improvement, you are most welcome.
import os
import requests
import threading
import urllib2
import time
url = "http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg"
def buildRange(value, numsplits):
lst = []
for i in range(numsplits):
if i == 0:
lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
else:
lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
return lst
class SplitBufferThreads(threading.Thread):
""" Splits the buffer to ny number of threads
thereby, concurrently downloading through
ny number of threads.
"""
def __init__(self, url, byteRange):
super(SplitBufferThreads, self).__init__()
self.__url = url
self.__byteRange = byteRange
self.req = None
def run(self):
self.req = urllib2.Request(self.__url, headers={'Range': 'bytes=%s' % self.__byteRange})
def getFileData(self):
return urllib2.urlopen(self.req).read()
def main(url=None, splitBy=3):
start_time = time.time()
if not url:
print "Please Enter some url to begin download."
return
fileName = url.split('/')[-1]
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
print "%s bytes to download." % sizeInBytes
if not sizeInBytes:
print "Size cannot be determined."
return
dataLst = []
for idx in range(splitBy):
byteRange = buildRange(int(sizeInBytes), splitBy)[idx]
bufTh = SplitBufferThreads(url, byteRange)
bufTh.start()
bufTh.join()
dataLst.append(bufTh.getFileData())
content = ''.join(dataLst)
if dataLst:
if os.path.exists(fileName):
os.remove(fileName)
print "--- %s seconds ---" % str(time.time() - start_time)
with open(fileName, 'w') as fh:
fh.write(content)
print "Finished Writing file %s" % fileName
if __name__ == '__main__':
main(url)
this is the first bare bone code I have got working, I discovered if I set bufTh buffer thread to Daemon False then process takes more time to finish.

HTTP requests on localhost via python urllib and urllib2 are very slow

I have written a simple class using urllib and urllib2 to send http requests and get the response. However, any request to localhost using its IP Address is very slow.
IP Address of LOCALHOST is = 192.168.158.27
import urllib2,urllib,re,datetime,time
class HTTPRequest():
def __init__(self,**kargs):
self._response = None
self._buffer = None
self._conn = urllib2.build_opener(urllib2.HTTPCookieProcessor())
urllib2.install_opener(self._conn)
def _encode_url(self,**kargs):
try:
params = urllib.urlencode(kargs)
except:
raise HTTPError("Failed to encode URL parameters..")
return str(params)
def _request(self,url=None,params=None):
try:
self._buffer = self._conn.open(url,params)
self._response = self._buffer.read()
except ValueError:
raise HTTPError("Invalid URL %s" % url)
except:
raise HTTPError("Failed to send HTTP(s) Request")
return str(self._response)
class HTTPError(Exception):
pass
PARAM_PASSWORD = 'password'
PARAM_USER = 'userName'
PARAM_ACTION = 'a'
PARAM_RID = 'rid'
PARAM_XO = 'xo'
PARAM_START_TIME = 't1'
PARAM_END_TIME = 't2'
PARAM_PATH = 'path'
BOOLEAN_TRUE = 'true'
BOOLEAN_FALSE = 'false'
ACTION_SIGNIN = 'signIn'
ACTION_SEARCH = 'search'
ACTION_GET_NEXT_RESULTS = 'getNextResults'
STATUS_SUCCEEDED = 'succeeded'
DEFAULT_WAIT = 5
host = "192.168.158.27"
user = "admin"
password = "admin"
protocol = "https"
port = 8443
query = "vm[=name rx (?i) *]&[#cpuUsage rx b .+][#cpuUsagemhz rx b .+]"
start_time = "10/05/2013 16:16:00"
end_time = "10/05/2013 17:16:00"
base_url = "%s://%s:%d" % (protocol,host,port)
login_url = "%s/user" % base_url
http = HTTPRequest()
attributes = {PARAM_PASSWORD : password,
PARAM_USER : user,
PARAM_ACTION : ACTION_SIGNIN,
PARAM_RID : 1000,
PARAM_XO : BOOLEAN_TRUE}
params = http._encode_url(**attributes)
if not http._request(login_url,params):
print "Login Failed.."
else:
print "Login Successful.. \n"
rid = 1000
search_url = "%s/Search" % base_url
status = STATUS_SUCCEEDED
hasMoreData = BOOLEAN_TRUE
completed = BOOLEAN_FALSE
total = 0
processed = 1
responseContent = ""
xml_dict = {}
_response = ""
attributes = {PARAM_START_TIME : start_time,
PARAM_END_TIME : end_time,
PARAM_ACTION : ACTION_SEARCH,
PARAM_RID : rid,
PARAM_PATH : query}
print "URL PARAMETERS :"
print "\tBase url = %s" % base_url
for param in attributes:
print "\t%s = %s" % (param,attributes[param])
#Query Execution Start Time
start = datetime.datetime.now()
while True:
params = http._encode_url(**attributes)
if hasMoreData == BOOLEAN_TRUE:
#Delay 10ms
time.sleep(10/1000)
#Send HTTP Request
response = http._request(search_url,params)
pattern = re.match(".*?hasMoreData=\"(.*?)\".*?",response)
if pattern:
hasMoreData = pattern.group(1)
pattern = re.match(".*?status=\"(.*?)\".*?",response)
if pattern:
status = pattern.group(1)
pattern = re.match(".*?completed=\"(.*?)\".*?",response)
if pattern:
completed = pattern.group(1)
pattern = re.match(".*?processed=\"(.*?)\".*?",response)
if pattern:
processed = pattern.group(1)
pattern = re.match(".*?total=\"(.*?)\".*?",response)
if pattern:
total = pattern.group(1)
pattern = re.match(".*?matched=\"(.*?)\".*?",response)
if pattern:
matched = pattern.group(1)
attributes = {PARAM_ACTION : ACTION_GET_NEXT_RESULTS,
PARAM_RID : rid}
if matched != "0":
response = re.sub(r'\n',"",response)
matchObj = re.search(r'(<Resource.*</Resource>)',response)
resp_match = ""
if matchObj:
resp_match = matchObj.group(1)
responseContent = str(responseContent) + str(resp_match)
else:
#Query Execution Completed
#Query Execution End Time
end = datetime.datetime.now()
print "RESULTS : "
print "\tStatus = %s"%status
print "\tHas More Data = %s"%hasMoreData
print "\tCompleted = %s"%completed
print "\tProcessed = %s"%processed
print "\tTotal = %s"%total
print "\tMatched = %s"%matched
print "\nQuery Execution Started : %s" % start
print "Query Execution Ended : %s\n" % end
if total != processed:
err = "The number records processed did not match"
err += " with the number of records completed."
print err
if not status == STATUS_SUCCEEDED:
err = "The responce status is not 'succeeded'"
print err
if completed == BOOLEAN_FALSE:
err = "The repsponse is completed. "
err += "However, the flag is set to 'False'"
print err
break
Instead of your local network IP, try using 127.0.0.1 instead.

python calling apis events and printing randomly

I have a interval of 60 and wanted to print 6 events every 1 minutes. But it prints 11,12 and 13 events randomly every 1 minutes. Why is that so ? Is it because of my codes or what other factors can cause this ?
My code is -
import logging
import httplib
import simplejson as json
import socket
import time
import datetime
import urllib2
import sys
import xml.dom.minidom
from bs4 import BeautifulSoup as soup
SCHEME = """<scheme>
<title>testingCurrentWeatherSG</title>
<description>Get data from forecast.</description>
<use_external_validation>true</use_external_validation>
<streaming_mode>simple</streaming_mode>
<endpoint>
<args>
<arg name="intervalone">
<title>Intervalone</title>
<description>How long to refresh this query?</description>
</arg>
</args>
</endpoint>
</scheme>
"""
def do_scheme():
print SCHEME
## Utility functions
def fahrenheit(fahren):
return (fahren-32) * 5.0/9.0
def get_percent(num):
return num * 100.
## Responses
def get_response(conn, url):
try:
conn.request('GET', url)
result = conn.getresponse()
data = result.read()
return json.loads(data)
except socket.timeout:
return None
## Printing
def print_forecast(name, di):
# Print the forcast from 'di', for location 'name'
# name is the name of the location, di is the api response
psi_avg=20
current = di['currently']
for key, value in sorted(current.iteritems()):
if key in ['cloudCover', 'icon', 'ozone', 'precipIntensity', # time
'precipProbability', 'precipType', 'pressure', 'summary',
'visibility', 'windBearing', 'windSpeed']:
print '{0} : {1}'.format(key, value)
elif key in ['temperature', 'dewPoint']:
print '%s: %.2f' % (key, fahrenheit(value))
elif key == 'humidity':
print '%s: %.2f' % (key, get_percent(value))
print 'psiAverage : ' + str(psi_avg)
print 'latitude : ' + str(di['latitude'])
print 'longitude : ' + str(di['longitude'])
print 'location : ' + str(name)
print
def weather_Connection(intervalone):
host = 'api.forecast.io'
conn = httplib.HTTPSConnection(host, timeout=60) # adjust timeout as desired
try:
urlnyp = '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.37871,103.848808'
conn.request('GET', urlnyp)
resultnyp = conn.getresponse()
contentnyp = resultnyp.read()
except socket.timeout:
print 'socket timeout'
return
# the locations and urls for the api calls
urls = {
'Choa Chu Kang': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.394557,103.746396',
'Kallang': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.311469,103.871399',
'Jurong West': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.352008,103.698599',
'Redhill': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.289732,103.81675',
'Tampines': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.353092,103.945229',
'Yishun': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.429463,103.84022',
}
responses = {}
for i, (name, url) in enumerate(sorted(urls.iteritems())):
response = get_response(conn, url)
if not response:
print 'socket timeout on url#%d: %s' % (i, url)
return
responses[name] = response
conn.close()
# print the forecast
for name, data in responses.iteritems():
print_forecast(name, data)
def get_config():
#Read XML Configuration data passed from splunkd on stdin
config = {}
try:
# read everything from stdin
config_str = sys.stdin.read()
# parse the config XML
doc = xml.dom.minidom.parseString(config_str)
root = doc.documentElement
conf_node = root.getElementsByTagName("configuration")[0]
if conf_node:
logging.debug("XML: found configuration")
stanza = conf_node.getElementsByTagName("stanza")[0]
if stanza:
stanza_name = stanza.getAttribute("name")
if stanza_name:
logging.debug("XML: found stanza " + stanza_name)
config["name"] = stanza_name
params = stanza.getElementsByTagName("param")
for param in params:
param_name = param.getAttribute("name")
logging.debug("XML: found param '%s'" % param_name)
if param_name and param.firstChild and \
param.firstChild.nodeType == param.firstChild.TEXT_NODE:
data = param.firstChild.data
config[param_name] = data
logging.debug("XML: '%s' -> '%s'" % (param_name, data))
if not config:
raise Exception, "Invalid configuration received from Splunk."
except Exception, e:
raise Exception, "Error getting Splunk configuration via STDIN: %s" % str(e)
return config
def run():
#The Main function that starts the action. The thread will sleep for however many seconds are configured via the Input.
# config = get_config()
#
#
# intervalone = config["intervalone"]
intervalone =60
while True:
weather_Connection(intervalone)
logging.info("Sleeping for %s seconds" %(intervalone))
time.sleep(float(intervalone))
if __name__ == '__main__':
if len(sys.argv) > 1:
if sys.argv[1] == "--scheme":
do_scheme()
else:
run()
sys.exit(0)
I've checked and tried your code and it works fine. Try replacing
logging.info("Sleeping for %s seconds" %(intervalone))
with
print("Sleeping for %s seconds" % (intervalone))
You should see this statement every 6 forecasts.
Note: why returning from weather_Connection() here
for i, (name, url) in enumerate(sorted(urls.iteritems())):
response = get_response(conn, url)
if not response:
print 'socket timeout on url#%d: %s' % (i, url)
return
responses[name] = response
You can just skip it with continue
for i, (name, url) in enumerate(sorted(urls.iteritems())):
response = get_response(conn, url)
if not response:
print 'socket timeout on url#%d: %s' % (i, url)
continue
responses[name] = response

Categories

Resources