I've recently been trying to scrape a site that contains chemistry exam tests in PDF form, using Python. I used the requests library and everything was going well until some of the downloads were cut short at a very small size, i.e. 2 KB. What's curious is that it happens completely at random: with every run of the script, different files get truncated. I've been scratching my head for a while now and decided to ask here. Downloading the files manually would probably have been faster by now, but I want to know why the script isn't working, for future reference.
I've written the script to be asynchronous, so it occurred to me that I could have been DoSing the server. However, I've replaced every Pool with a synchronous for loop, even adding time.sleep() here and there (a rough sketch of that version follows the script below) - it didn't help. With that approach none of the files were fully downloaded; practically every single one stopped at 2 KB.
Please forgive me if the question is naive or my mistake is foolish, as I am only a hobby programmer. I'll be grateful for any help.
P.S. I intercepted the headers using Postman from Chrome; without them the response was 500. However, I won't include them here, as they contain session IDs that would let you log in to my account.
The script is as follows:
from shutil import copyfileobj
from multiprocessing.dummy import Pool as ThreadPool
from requests import get
from time import sleep
titles = {
"95": "Budowa atomu. Układ okresowy pierwiastków chemicznych",
"96": "Wiązania chemiczne",
"97": "Systematyka związków nieorganicznych",
"98": "Stechiometria",
"99": "Reakcje utleniania-redukcji. Elektrochemia",
"100": "Roztwory",
"101": "Kinetyka chemiczna",
"102": "Reakcje w wodnych roztworach elektrolitów",
"103": "Charakterystyka pierwiastków i związków chemicznych",
"104": "Chemia organiczna jako chemia związków węgla",
"105": "Węglowodory",
"106": "Jednofunkcyjne pochodne węglowodorów",
"107": "Wielofunkcyjne pochodne węglowodorów",
"108": "Arkusz maturalny"
}
#collection = {"120235": "Chemia nieorganiczna", "120586": "Chemia organiczna"}
url = "https://e-testy.terazmatura.pl/print/%s/quiz_%s/%s"
def downloadTest(id):
    with ThreadPool(2) as tp:
        tp.starmap(downloadActualTest, [(id, "blank"), (id, "key")])

def downloadActualTest(id, dataType):
    name = titles[str(id)]
    if id in range(95, 104):
        collectionId = 120235
    else:
        collectionId = 120586
    if dataType == "blank":
        with open("Pulled Data/%s - pusty.pdf" % name, "wb") as test:
            print("Downloading: " + url % (collectionId, id, "blank") + '\n')
            r = get(url % (collectionId, id, "blank"),
                    stream=True,
                    headers=headers)
            r.raw.decode_content = True
            copyfileobj(r.raw, test)
    elif dataType == "key":
        with open("Pulled Data/%s - klucz.pdf" % name, "wb") as test:
            print("Downloading: " + url % (collectionId, id, "key") + '\n')
            r = get(url % (collectionId, id, "key"),
                    stream=True,
                    headers=headers)
            r.raw.decode_content = True
            copyfileobj(r.raw, test)

with ThreadPool(3) as p:
    p.map(downloadTest, range(95, 109))
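For reference, the synchronous variant mentioned above looked roughly like this (a rough sketch from memory; the sleep duration is arbitrary):

for testId in range(95, 109):
    for dataType in ("blank", "key"):
        downloadActualTest(testId, dataType)
        sleep(2)  # arbitrary pause between requests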
Related
There is about a 70% chance that it shows this error:
res=pool.map(feng,urls)
File "c:\Python27\lib\multiprocessing\pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "c:\Python27\lib\multiprocessing\pool.py", line 567, in get
raise self._value
IndexError: list index out of range
I don't know why; if there are fewer than 100 items of data, there is only about a 5% chance of that message appearing. Does anyone have an idea how to fix this?
# coding: utf-8
import multiprocessing
import requests
import bs4
import re
import string

root_url = 'http://www.haoshiwen.org'
#index_url = root_url+'/type.php?c=1'

def xianqin_url():
    f = 0
    h = 0
    x = 0
    y = 0
    b = []
    l = []
    for i in range(1, 64):  # number of pages
        index_url = root_url + '/type.php?c=1' + '&page=' + "%s" % i
        response = requests.get(index_url)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        x = [a.attrs.get('href') for a in soup.select('div.sons a[href^=/]')]  # collect the links under div.sons on each page
        c = len(x)  # c links in total
        j = 0
        for j in range(c):
            url = root_url + x[j]
            us = str(url)
            print "Collected %s" % us
            l.append(url)  # pool = multiprocessing.Pool(8)
    return l

def feng(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    #print response.text
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    #content = soup.select('div.shileft')
    qq = str(soup)
    soupout = re.findall(r"原文(.+?)</div>", qq, re.S)  # the fragment starting with "原文" ("original text") and ending at </div>
    #print soupout[1]
    content = str(soupout[1])
    b = "风"
    cc = content.count(b, 0, len(content))
    return cc

def start_process():
    print 'Starting', multiprocessing.current_process().name

def feng(url):  # note: this second definition of feng overrides the one above
    response = requests.get(url)
    response.encoding = 'utf-8'
    #print response.text
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    #content = soup.select('div.shileft')
    qq = str(soup)
    soupout = re.findall(r"原文(.+?)</div>", qq, re.S)  # the fragment starting with "原文" ("original text") and ending at </div>
    #print soupout[1]
    content = str(soupout[1])
    b = "风"
    c = "花"
    d = "雪"
    e = "月"
    f = content.count(b, 0, len(content))
    h = content.count(c, 0, len(content))
    x = content.count(d, 0, len(content))
    y = content.count(e, 0, len(content))
    return f, h, x, y

def find(urls):
    r = [0, 0, 0, 0]
    pool = multiprocessing.Pool()
    res = pool.map(feng, urls)
    for i in range(len(res)):
        r = map(lambda (a, b): a + b, zip(r, res[i]))
    return r

if __name__ == "__main__":
    print "Starting to collect URLs"
    qurls = xianqin_url()
    print "Collected %s links" % len(qurls)
    print "Starting to match pre-Qin poems"
    counts = find(qurls)
    print '''
Out of %s pre-Qin texts:
---------------------------
风 (wind):   %s
花 (flower): %s
雪 (snow):   %s
月 (moon):   %s
Data source: %s
''' % (len(qurls), counts[0], counts[1], counts[2], counts[3], root_url)
(Note: Stack Overflow refused to accept my post body with the pool.map call written normally, so I had typed it as res=pool.map4(feng,urls); my actual code uses res = pool.map(feng, urls).)
I'm trying to extract some substrings from this website using multiprocessing.
Indeed, multiprocessing makes it a bit hard to debug, as you don't see where the index-out-of-range error occurred (the error message makes it look as if it happened internally in the multiprocessing module).
In some cases this line:
content=str(soupout[1])
raises an IndexError, because soupout is an empty list (or has fewer than two entries - note that the code indexes soupout[1], the second match). If you change it to
if len(soupout) < 2:
    return None
content = str(soupout[1])
and then remove the Nones that were returned by changing
res=pool.map(feng,urls)
into
res = pool.map(feng,urls)
res = [r for r in res if r is not None]
then you can avoid the error. That said, you probably want to find out the root cause of why re.findall returned an empty list. It is certainly a better idea to select the node with BeautifulSoup than with a regex, as matching with bs4 is generally more stable, especially if the website slightly changes its markup (e.g. whitespace); a sketch of that follows below.
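For example, a sketch of what the bs4-based selection might look like. I'm assuming the poem text lives under the div.shileft node that is already commented out in your code - that selector is an assumption on my part, so check it against the actual markup:

def feng(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    nodes = soup.select('div.shileft')  # assumed container of the poem text
    if not nodes:
        return None  # request failed or the markup changed
    content = nodes[0].get_text()
    return tuple(content.count(ch) for ch in (u"风", u"花", u"雪", u"月"))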
Update:
"why is soupout an empty list? When I didn't use pool.map I never had this error message shown"
This is probably because you hammer the web server too fast. In a comment you mention that you sometimes get 504 in response.status_code. 504 means Gateway Time-out: the server was acting as a gateway or proxy and did not receive a timely response from the upstream server.
That is because haoshiwen.org seems to be powered by kangle, which is a reverse proxy. The reverse proxy forwards every request you send on to the web server behind it, and if you start too many processes at once the poor web server cannot handle the flood. Kangle has a default timeout of 60 s, so as soon as it doesn't get an answer back from the web server within 60 s it shows the error you posted.
How do you fix that?
You could limit the number of processes, e.g. pool = multiprocessing.Pool(2); you'd need to experiment to find a good number of processes.
At the top of feng(url) you could add a time.sleep(5) so each process waits 5 seconds between requests; here too you'd need to experiment with the sleep time.
A minimal sketch combining both ideas follows.
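This is only a sketch built from the code in the question; the pool size of 2 and the 5-second pause are starting values to tune, not recommendations:

# coding: utf-8
import re
import time
import multiprocessing
import requests
import bs4

def feng(url):
    time.sleep(5)  # throttle each worker between requests (tune this)
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    soupout = re.findall(r"原文(.+?)</div>", str(soup), re.S)
    if len(soupout) < 2:
        return None  # e.g. a 504 error page came back instead of a poem page
    content = str(soupout[1])
    return tuple(content.count(ch) for ch in ("风", "花", "雪", "月"))

def find(urls):
    pool = multiprocessing.Pool(2)  # limit the number of concurrent workers (tune this)
    res = pool.map(feng, urls)
    res = [r for r in res if r is not None]
    totals = [0, 0, 0, 0]
    for counts in res:
        totals = [a + b for a, b in zip(totals, counts)]
    return totals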
So here's the problem: our security teacher made a site that requires authentication and then asks for a code (4 characters) so that you can access a file. He told us to write a brute-force program in Python (using any library we want) that can find the password. To start, I wrote a program that tries random combinations in that code field, just to get an idea of the time per request (I'm using the requests library), and the result was disappointing: each request takes around 8 seconds.
With some calculations: 36^4 = 1,679,616 possible combinations; at 8 seconds per request that is 13,436,928 seconds, i.e. around 155.52 days.
I would really appreciate it if anyone could help me make this faster. (He told us that it is possible to try around 1200 combinations per second.)
Here's my code:
import requests
import time
import random

def gen():
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    pw_length = 4
    mypw = ""
    for i in range(pw_length):
        next_index = random.randrange(len(alphabet))
        mypw = mypw + alphabet[next_index]
    return mypw

t0 = time.clock()
t1 = time.time()
cookie = {'ig': 'b0b5294376ef12a219147211fc33d7bb'}
for i in range(0, 5):
    t2 = time.clock()
    t3 = time.time()
    values = {'RECALL': gen()}
    r = requests.post('http://www.example.com/verif.php', stream=True, cookies=cookie, data=values)
    print("##################################")
    print("cpu time for req ", i, ":", time.clock() - t2)
    print("wall time for req ", i, ":", time.time() - t3)
    print("##################################")
print("##################################")
print("Total cpu time:", time.clock() - t0)
print("Total wall time:", time.time() - t1)
Thank you
One thing you could try is to use a Pool of workers to do multiple requests in parallel, passing a password to each worker. Something like:
import itertools
import requests
from multiprocessing import Pool

alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"  # same alphabet as in the question
cookie = {'ig': 'b0b5294376ef12a219147211fc33d7bb'}
NUMBER_OF_PROCESSES = 8  # placeholder - tune to what the server tolerates

def pass_generator():
    for pass_tuple in itertools.product(alphabet, repeat=4):
        yield ''.join(pass_tuple)

def check_password(password):
    values = {'RECALL': password}
    r = requests.post('http://www.example.com/verif.php', stream=True, cookies=cookie, data=values)
    # Check response here.

pool = Pool(processes=NUMBER_OF_PROCESSES)
pool.map(check_password, pass_generator())
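If the goal is to stop as soon as the right code is found, a variant of check_password could return the password on success, and the main loop could use imap_unordered to bail out early. This is only a sketch: how a correct code is recognised (a status code, a redirect, some marker text in the page) is an assumption you would have to replace with whatever the real site returns.

def check_password(password):
    values = {'RECALL': password}
    r = requests.post('http://www.example.com/verif.php', cookies=cookie, data=values)
    # Hypothetical success test - replace with however the site signals a correct code.
    if 'access granted' in r.text:
        return password
    return None

if __name__ == '__main__':
    pool = Pool(processes=NUMBER_OF_PROCESSES)
    for result in pool.imap_unordered(check_password, pass_generator(), chunksize=100):
        if result is not None:
            print("Found the code: " + result)
            pool.terminate()
            break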
I am trying to make a Python script that downloads just the metadata of a torrent, given an infohash. The infohash is loaded from a JSON file with contents like this:
{"infohash":"someinfohash"}
If I manually hard-code the infohash into the link string, or build it myself using a dictionary, like this:
link = 'magnet:?xt=urn:btih:someinfohash'
or like this:
foo = {}
foo['infohash'] = 'someinfohash'
link = 'magnet:?xt=urn:btih:' + foo['infohash']
I can always download the metadata, no problem. But for some reason, when I load it from a json file, it consistently times out.
import json
import sys
import time
import libtorrent as lt

thedata = open(sys.argv[1]).read()
thedata = json.loads(thedata)

ses = lt.session()
ses.listen_on(6881, 6891)
params = {
    'save_path': '.',  # doesn't matter because we're only downloading metadata
}
link = 'magnet:?xt=urn:btih:' + thedata['infohash']
handle = lt.add_magnet_uri(ses, link, params)
ses.add_dht_router('dht.transmissionbt.com', 6881)
ses.add_dht_router('router.bittorrent.com', 6881)
ses.add_dht_router('router.utorrent.com', 6881)
ses.start_dht()

sys.stdout.write('Downloading metadata...')
sys.stdout.flush()
timeout = time.time()
while (not handle.has_metadata()):
    if (time.time() >= 300 + timeout):
        print 'timed out'
        sys.exit(1)
    time.sleep(1)
print 'done'
ses.pause()
If I check if the strings are equal, like so:
link = 'magnet:?xt=urn:btih:' + thedata['infohash']
link2 = 'magnet:?xt=urn:btih:someinfohash'
print link == link2
It prints True.
Does anybody have any idea what is going on?
I finally found what was wrong.
I got the idea to compare the printed representation of the JSON-generated dictionary with the one I built manually:
thedata = open(sys.argv[1]).read()
thedata = json.loads(thedata)
print thedata
thedata = {}
thedata['infohash'] = 'someinfohash'
print thedata
What came up was this:
{u'infohash': u'someinfohash'}
{'infohash': 'someinfohash'}
Apparently the keys and values in the JSON-made dict were unicode strings, and this somehow prevented libtorrent from connecting to seeds. After converting all the keys and values to plain byte strings, the script runs perfectly.
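For reference, a minimal sketch of that conversion under Python 2 (where json.loads returns unicode objects); the str() casts assume the values are plain ASCII, which holds for a hex infohash:

import json
import sys

thedata = json.loads(open(sys.argv[1]).read())
# json.loads gives unicode keys and values under Python 2; cast them to plain str
thedata = dict((str(k), str(v)) for k, v in thedata.items())

link = 'magnet:?xt=urn:btih:' + thedata['infohash']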
EDIT (SOLVED): When I read the values in from my file, a newline character (\n) was being added to the end, and this split my request string at that point.
I think it's to do with how I saved the values to the file in the first place. Many thanks.
I have the following code:
results = 'http://www.myurl.com/'+str(mystring)
print str(results)
request = urllib2.Request(results)
request.add_header('User-Agent','Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)')
opener = urllib2.build_opener()
text = opener.open(request).read()
Which is in a loop.
After the loop has run a few times, str(mystring) changes to give a different set of results.
I can loop the script as many times as I like keeping the value of str(mystring) constant, but every time I change the value of str(mystring) I get an error saying "no host given" when the code tries to build the opener.
opener = urllib2.build_opener()
Can anyone help please?
TIA,
Paul.
EDIT:
More code here.....
import sys
import string
import httplib
import urllib2
import re
import random
import time

def StripTags(text):
    finished = 0
    while not finished:
        finished = 1
        start = text.find("<")
        if start >= 0:
            stop = text[start:].find(">")
            if stop >= 0:
                text = text[:start] + text[start+stop+1:]
                finished = 0
    return text

mystring = "test"
d = {}
with open("myfile", "r") as f:
    while True:
        page_counter = 0
        print str(mystring)
        try:
            while page_counter < 20:
                results = 'http://www.myurl.com/' + str(mystring)
                print str(results)
                request = urllib2.Request(results)
                request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)')
                opener = urllib2.build_opener()
                text = opener.open(request).read()
                finds = (re.findall('([\w\.\-]+' + mystring + ')', StripTags(text)))
                for find in finds:
                    d[find] = 1
                uniq_emails = d.keys()
                page_counter = page_counter + 1
                print "found this " + str(finds)
                random.seed()
                n = random.random()
                i = n * 5
                print "Pausing script for " + str(i) + " Seconds"
                time.sleep(i)
            mystring = next(f)
        except IOError:
            print "No result found!"
I found the answer. It's as follows...
The values for mystring were read in from a file.
In the script I wrote to create that file, I opened it with "w" instead of "wb".
Each line in the file ended with a newline character "\n".
When mystring was added to the request string, the newline ended up in the middle of the request string. [1]
This would never have been apparent from my code, because I changed it when posting here in an effort to hide the real URL I am using to get my results. [2]
My actual URL looks more like this:
Myurl.com/mystring/otherstuff/page_counter/morestuff.htm
The \n being read from the file spliced my URL and gave urllib problems.
[1] I use Windows. It adds unseen characters to text files. If I'd opened the file for writing with "wb" instead of "w", the contents would have been written without the unseen \n.
[2] Always post your full code, kids. The good people of Stack Overflow can't help you unless they can see what you are doing.
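Not the fix I went with, but for anyone reading values from a file like this: stripping the line ending as each line is read also avoids the problem. A minimal sketch:

with open("myfile", "r") as f:
    for line in f:
        mystring = line.strip()  # removes the trailing \n (and any \r from Windows)
        results = 'http://www.myurl.com/' + mystring
        # ... rest of the request code as before ...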
Many thanks all, I hope this helps someone out at some point.
Paul.
In the while loop, you're setting results to something which is not a url:
results = 'myurl+str(mystring)'
It should probably be results = myurl+str(mystring)
By the way, it appears there's no need for all the casting to string (str()) you do:
(expanded on request)
print str(foo): in such a case, str() is never necessary. Python will always print foo's string representation
results = 'http://www.myurl.com/'+str(mystring). This is also unnecessary; mystring is already a string, so 'http://www.myurl.com/' + mystring would suffice.
print "Pausing script for " + str(i) + " Seconds". Here you would get an error without str() since you can't do string + int. However, print "foo", 1, "bar" does work. As do print "foo %i bar" % 1 and print "foo {0} bar".format(1) (see here)
I have a CSV with keywords in one column and the number of impressions in a second column.
I'd like to pass each keyword in a URL (while looping) and have the Google Language API return which language the keyword is in.
I have it working manually. If I enter (with the correct api key):
http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=merde
I get:
{"responseData": {"language":"fr","isReliable":false,"confidence":6.213709E-4}, "responseDetails": null, "responseStatus": 200}
which is correct, 'merde' is French.
So far I have this code, but I keep getting "server unreachable" errors:
import time
import csv
from operator import itemgetter
import sys
import fileinput
import urllib2
import json
E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2
#not working
def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return row in result_object

#not working
def retrieve_terms(seedterm):
    print(seedterm)
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=%(seed)s'
    url = url_template % {"seed": seedterm}
    try:
        with urllib2.urlopen(url) as data:
            data = perform_request(seedterm)
            result = data.read()
    except:
        sys.stderr.write('%s\n' % 'Could not request data from server')
        exit(E_OPERATION_ERROR)
    #terms = parse_result(result)
    #print terms
    print result

def main(argv):
    filename = argv[1]
    csvfile = open(filename, 'r')
    csvreader = csv.DictReader(csvfile)
    rows = []
    for row in csvreader:
        rows.append(row)
    sortedrows = sorted(rows, key=itemgetter('impressions'), reverse=True)
    keys = sortedrows[0].keys()
    for item in sortedrows:
        retrieve_terms(item['keywords'])
    try:
        outputfile = open('Output_%s.csv' % (filename), 'w')
    except IOError:
        print("The file is active in another program - close it first!")
        sys.exit()
    dict_writer = csv.DictWriter(outputfile, keys, lineterminator='\n')
    dict_writer.writer.writerow(keys)
    dict_writer.writerows(sortedrows)
    outputfile.close()
    print("File is Done!! Check your folder")

if __name__ == '__main__':
    start_time = time.clock()
    main(sys.argv)
    print("\n")
    print time.clock() - start_time, "seconds for script time"
Any idea how to finish the code so that it will work? Thank you!
Try to add referrer, userip as described in the docs:
An area to pay special attention to relates to correctly identifying yourself in your requests. Applications MUST always include a valid and accurate http referer header in their requests. In addition, we ask, but do not require, that each request contains a valid API Key. By providing a key, your application provides us with a secondary identification mechanism that is useful should we need to contact you in order to correct any problems. Read more about the usefulness of having an API key.
Developers are also encouraged to make use of the userip parameter (see below) to supply the IP address of the end-user on whose behalf you are making the API request. Doing so will help distinguish this legitimate server-side traffic from traffic which doesn't come from an end-user.
Here's an example based on the answer to the question "access to google with python":
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib, urllib2
from pprint import pprint
api_key, userip = None, None
query = {'q' : 'матрёшка'}
referrer = "https://stackoverflow.com/q/4309599/4279"
if userip:
    query.update(userip=userip)
if api_key:
    query.update(key=api_key)
url = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&%s' % (
    urllib.urlencode(query))
request = urllib2.Request(url, headers=dict(Referer=referrer))
json_data = json.load(urllib2.urlopen(request))
pprint(json_data['responseData'])
Output
{u'confidence': 0.070496580000000003, u'isReliable': False, u'language': u'ru'}
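For the question's use case you usually only need the language code itself; continuing from the snippet above:

# Pull out just the detected language code (u'ru' for this query, 'fr' for the 'merde' example).
print json_data['responseData']['language']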
Another issue might be that seedterm is not properly quoted:
if isinstance(seedterm, unicode):
    value = seedterm
else:  # bytes
    value = seedterm.decode(put_encoding_here)
url = 'http://...q=%s' % urllib.quote_plus(value.encode('utf-8'))
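For example, quoting a non-ASCII keyword such as the query used in the snippet further up:

print urllib.quote_plus(u'матрёшка'.encode('utf-8'))
# -> %D0%BC%D0%B0%D1%82%D1%80%D1%91%D1%88%D0%BA%D0%B0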