python request urls parallel [duplicate]

This question already has an answer here:
How to send multiple http requests python
(1 answer)
Closed 6 years ago.
I created the following script to download images from an API endpoint, and it works as intended. The problem is that it is rather slow, because all the requests have to wait on each other. What is the correct way to keep the steps sequential for each item I want to fetch, but run the items themselves in parallel? The data comes from an online service called servicem8.
So what I hope to achieve is:
fetch all possible job ids => keep the name and other info
fetch the name of the customer
fetch each attachment of a job
These three steps should be done for each job. So I could make things parallel per job, as the jobs do not have to wait on each other.
Update:
What I do not understand is how to bundle, for example, the three calls per item into one unit, since it is only within an item that the steps must stay in order. For example, when I want to
fetch_item(fetch name => fetch description => fetch id)
it is the fetch_item call as a whole that I want to run in parallel?
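A minimal sketch of that idea with concurrent.futures, assuming hypothetical fetch_name/fetch_description/fetch_attachments helpers standing in for the real API calls and jobs being the list fetched in step 1: the three steps stay sequential inside the worker, while the jobs run in parallel threads.
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_job(job):
    # The three steps stay in order for this one job...
    name = fetch_name(job)
    description = fetch_description(job)
    attachments = fetch_attachments(job)
    return job, name, description, attachments

# ...while different jobs run concurrently.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_job, job) for job in jobs]
    for future in as_completed(futures):
        result = future.result()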
The current code I have is working but rather slow:
import requests
import dateutil.parser
import shutil
import os

user = "test@test.com"
passw = "test"

print("Read json")
url = "https://api.servicem8.com/api_1.0/job.json"
r = requests.get(url, auth=(user, passw))
print("finished reading jobs.json file")

scheduled_jobs = []
if r.status_code == 200:
    for item in r.json():
        scheduled_date = item['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 10:
                    if parsed_date.day == 10:
                        url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
                        c = requests.get(url_customer, auth=(user, passw))
                        cus_name = c.json()['name']
                        scheduled_jobs.append(
                            [item['uuid'], item['generated_job_id'], cus_name])
        except ValueError:
            pass

for job in scheduled_jobs:
    print("fetch for job {}".format(job))
    url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    r = requests.get(url, auth=(user, passw))
    if r.json() == []:
        pass
    for attachment in r.json():
        if attachment['active'] == 1 and attachment['file_type'] != '.pdf':
            print("fetch for attachment {}".format(attachment))
            url_staff = "https://api.servicem8.com/api_1.0/Staff.json?%24filter=uuid%20eq%20{}".format(
                attachment['created_by_staff_uuid'])
            s = requests.get(url_staff, auth=(user, passw))
            for staff in s.json():
                tech = "{}_{}".format(staff['first'], staff['last'])
                url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
                r = requests.get(url, auth=(user, passw), stream=True)
                if r.status_code == 200:
                    creation_date = dateutil.parser.parse(
                        attachment['timestamp']).strftime("%d.%m.%y")
                    if not os.path.exists(os.getcwd() + "/{}/{}".format(job[2], job[1])):
                        os.makedirs(os.getcwd() + "/{}/{}".format(job[2], job[1]))
                    path = os.getcwd() + "/{}/{}/SC -O {} {}{}".format(
                        job[2], job[1], creation_date, tech.upper(), attachment['file_type'])
                    print("writing file to path {}".format(path))
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                else:
                    print(r.text)
Update [14/10]
I updated the code as follows with the hints given. Thanks a lot for that. The only thing left to optimize, I guess, is the attachment downloading, but it is working fine now (a sketch of parallelising just those downloads follows after the script). A funny thing I learned is that you cannot create a CON folder on a Windows machine :-) I did not know that.
I use pandas as well, just to avoid some loops over my list of dicts, though I am not sure it is the most performant approach. The longest part is actually reading in the full JSON files. I read them in completely because I could not find a way of telling the API to return only the jobs from September 2016; the API query function seems to work with eq/lt/gt.
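As an aside, a hedged sketch of what such a server-side date filter might look like; it is not verified against the ServiceM8 docs and assumes $filter accepts gt on timestamp fields the same way the code uses eq, with the timestamp format below being a guess (user/passw are the credentials from the scripts):
import requests

base = "https://api.servicem8.com/api_1.0/job.json"
params = {"$filter": "job_is_scheduled_until_stamp gt '2016-09-01 00:00:00'"}
jobs = requests.get(base, params=params, auth=(user, passw))
If a filter like this is accepted, the month check in Python becomes unnecessary. The updated script itself follows.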
import requests
import dateutil.parser
import shutil
import os
import pandas as pd
import grequests

user = ""
passw = ""
FOLDER = os.getcwd()
headers = {"Accept-Encoding": "gzip, deflate"}

urls = [
    'https://api.servicem8.com/api_1.0/job.json',
    'https://api.servicem8.com/api_1.0/Attachment.json',
    'https://api.servicem8.com/api_1.0/Staff.json',
    'https://api.servicem8.com/api_1.0/Company.json'
]

# Create a set of unsent requests:
print("Read json files")
rs = (grequests.get(u, auth=(user, passw), headers=headers) for u in urls)
# Send them all at the same time:
jobs, attachments, staffs, companies = grequests.map(rs)

# create dataframes
df_jobs = pd.DataFrame(jobs.json())
df_attachments = pd.DataFrame(attachments.json())
df_staffs = pd.DataFrame(staffs.json())
df_companies = pd.DataFrame(companies.json())

# previous sequential requests, kept for reference:
# url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
# c = requests.get(url_customer, auth=(user, passw))
# url = "https://api.servicem8.com/api_1.0/job.json"
# jobs = requests.get(url, auth=(user, passw), headers=headers)
# print("Reading attachments json")
# url = "https://api.servicem8.com/api_1.0/Attachment.json"
# attachments = requests.get(url, auth=(user, passw), headers=headers)
# print("Reading staff.json")
# url_staff = "https://api.servicem8.com/api_1.0/Staff.json"
# staffs = requests.get(url_staff, auth=(user, passw))

scheduled_jobs = []
if jobs.status_code == 200:
    print("finished reading json file")
    for job in jobs.json():
        scheduled_date = job['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 9:
                    cus_name = df_companies[df_companies.uuid == job['company_uuid']].iloc[0]['name'].upper()
                    cus_name = cus_name.replace('/', '')
                    scheduled_jobs.append([job['uuid'], job['generated_job_id'], cus_name])
        except ValueError:
            pass

print("{} jobs to fetch".format(len(scheduled_jobs)))

for job in scheduled_jobs:
    print("fetch for job attachments {}".format(job))
    # url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    if attachments == []:
        pass
    for attachment in attachments.json():
        if attachment['related_object_uuid'] == job[0]:
            if attachment['active'] == 1 and attachment['file_type'] != '.pdf' and attachment['attachment_source'] != 'INVOICE_SIGNOFF':
                for staff in staffs.json():
                    if staff['uuid'] == attachment['created_by_staff_uuid']:
                        tech = "{}_{}".format(
                            staff['first'].split()[-1].strip(), staff['last'])
                creation_timestamp = dateutil.parser.parse(attachment['timestamp'])
                creation_date = creation_timestamp.strftime("%d.%m.%y")
                creation_time = creation_timestamp.strftime("%H_%M_%S")
                path = FOLDER + "/{}/{}/SC_-O_D{}_T{}_{}{}".format(
                    job[2], job[1], creation_date, creation_time, tech.upper(), attachment['file_type'])
                # fetch attachment
                if not os.path.isfile(path):
                    url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
                    r = requests.get(url, auth=(user, passw), stream=True)
                    if r.status_code == 200:
                        if not os.path.exists(FOLDER + "/{}/{}".format(job[2], job[1])):
                            os.makedirs(FOLDER + "/{}/{}".format(job[2], job[1]))
                        print("writing file to path {}".format(path))
                        with open(path, 'wb') as f:
                            r.raw.decode_content = True
                            shutil.copyfileobj(r.raw, f)
                    else:
                        print(r.text)
                else:
                    print("file already exists")
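As for the remaining attachment downloads, a hedged sketch of running only those in parallel with a thread pool; download_tasks is a hypothetical list of (url, path) pairs collected by the loops above instead of downloading inline, and user/passw are the credentials from the script:
import shutil
import requests
from concurrent.futures import ThreadPoolExecutor

def download_attachment(url, path):
    # One worker per attachment; the GET itself is unchanged.
    r = requests.get(url, auth=(user, passw), stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)

with ThreadPoolExecutor(max_workers=8) as executor:
    for url, path in download_tasks:
        executor.submit(download_attachment, url, path)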

The general idea is to use asynchronous URL requests, and there is a Python module named grequests for that: https://github.com/kennethreitz/grequests
From the documentation:
import grequests

urls = [
    'http://www.heroku.com',
    'http://python-tablib.org',
    'http://httpbin.org',
    'http://python-requests.org',
    'http://fakedomain/',
    'http://kennethreitz.com'
]

# Create a set of unsent Requests:
rs = (grequests.get(u) for u in urls)
# Send them all at the same time:
grequests.map(rs)
And the response:
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, None, <Response [200]>]
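Since grequests.map() returns the responses in the same order as the requests you pass in, you can zip them back onto whatever you built the requests from. A hedged sketch using the attachment query from the question (user, passw and scheduled_jobs as defined there):
import grequests

reqs = [grequests.get(
            "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0]),
            auth=(user, passw))
        for job in scheduled_jobs]

for job, resp in zip(scheduled_jobs, grequests.map(reqs)):
    # resp is None when a request failed, mirroring the documentation output above.
    if resp is not None and resp.status_code == 200:
        attachments = resp.json()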

Related

Error: proxy = next(proxy_pool) StopIteration

I am trying to run a script, and with its standard proxy URL it runs fine. Once I add my own proxy URL, I get the error Error: proxy = next(proxy_pool) StopIteration. My URL is in another file, and I can also link that if needed.
Code is below, if anyone can help that would be great.
import string
import os
import requests
import proxygen
import random  # needed for random.choice below
import base64
from itertools import cycle
from random import randint

N = input("How many tokens : ")
count = 0
current_path = os.path.dirname(os.path.realpath(__file__))
url = "https://discordapp.com/api/v6/users/@me/library"

while(int(count) < int(N)):
    tokens = []
    base64_string = "=="
    while(base64_string.find("==") != -1):
        sample_string = str(randint(000000000000000000, 999999999999999999))
        sample_string_bytes = sample_string.encode("ascii")
        base64_bytes = base64.b64encode(sample_string_bytes)
        base64_string = base64_bytes.decode("ascii")
    else:
        token = base64_string + "." + random.choice(string.ascii_letters).upper() + ''.join(random.choice(string.ascii_letters + string.digits)
                for _ in range(5)) + "." + ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(27))
        count += 1
        tokens.append(token)

    proxies = proxygen.get_proxies()
    proxy_pool = cycle(proxies)

    for token in tokens:
        proxy = next(proxy_pool)
        header = {
            "Content-Type": "application/json",
            "authorization": token
        }
        try:
            r = requests.get(url, headers=header, proxies={'https': "http://" + proxy})
            print(r.text)
            print(token)
            if r.status_code == 200:
                print(u"\u001b[32;1m[+] Token Works!\u001b[0m")
                f = open(current_path + "/" + "workingtokens.txt", "a")
                f.write(token + "\n")
            elif "rate limited." in r.text:
                print("[-] You are being rate limited.")
            else:
                print(u"\u001b[31m[-] Invalid Token.\u001b[0m")
        except requests.exceptions.ProxyError:
            print("BAD PROXY")
            tokens.remove(token)
Try this code for get_proxies():
import requests

def get_proxies():
    # in your example the scheme was missing from the URL
    url = 'https://proxy.link/list/get/5691264d3b19a600feef69dc3a27368d'
    response = requests.get(url)
    raw = response.text.split('\n')
    proxies = set(raw)
    return proxies
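For what it's worth, next() on an itertools.cycle raises StopIteration only when the iterable it was given is empty, so a hedged guard around the call in the script above (assuming get_proxies() can come back empty when the list URL returns nothing usable):
from itertools import cycle

proxies = get_proxies()
if not proxies:
    raise SystemExit("No proxies returned; check the proxy list URL")
proxy_pool = cycle(proxies)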

My code hangs when the server returns request error

My code is downloading the files perfectly, but it stops when the server returns a 404 error. How do I change this code to add only URLs that don't have a 404 error to the list?
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

artigos = []
pdfs = []

http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')

for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        artigos.append(link['href'])

for x in artigos:
    if x.endswith('pdf'):
        pdfs.append(x)

print(pdfs)

def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        resposta.raise_for_status()

if __name__ == '__main__':
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    for i in range(1, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        baixa_arquivo(url_basica.format(a), nome_do_arquivo)
I was able to fix the problem by adding a condition at the end of the code. I requested the status_code and canceled the download if the status was 404.
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

artigos = []
pdfs = []

http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')

for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        artigos.append(link['href'])

for x in artigos:
    if x.endswith('pdf'):
        pdfs.append(x)

print(pdfs)

def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        resposta.raise_for_status()

if __name__ == '__main__':
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    for i in range(550, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        z = url_basica.format(a)
        y = requests.get(z)
        if y.status_code != 404:
            baixa_arquivo(z, nome_do_arquivo)
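A hedged alternative that avoids requesting each PDF twice: make a single GET per file and just report anything that is not a 200 instead of raising, reusing the names from the code above:
def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        # Report and move on instead of raising, so one 404 does not stop the loop.
        print('Pulando {} (status {})'.format(url, resposta.status_code))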

concurrent.futures multithreading with 2 lists as variables

So I would like to multi-thread the following working piece of code with concurrent futures but nothing I've tried so far seems to work.
def download(song_filename_list, song_link_list):
    with requests.Session() as s:
        login_request = s.post(login_url, data=payload, headers=headers)
        for x in range(len(song_filename_list)):
            download_request = s.get(song_link_list[x], headers=download_headers, stream=True)
            if download_request.status_code == 200:
                print(f"Downloading {x+1} out of {len(song_filename_list)}!\n")
            else:
                print(f"\nStatus Code: {download_request.status_code}!\n")
                sys.exit()
            with open(song_filename_list[x], "wb") as file:
                file.write(download_request.content)
The 2 main variables are the song_filename_list and the song_link_list.
The first list has names of each file and the second has all their respective download links.
So the name and link of each file are located at the same position.
For example: name_of_file1 = song_filename_list[0] and link_of_file1 = song_link_list[0]
This is the most recent attempt at multi-threading:
def download(song_filename_list, song_link_list):
    with requests.Session() as s:
        login_request = s.post(login_url, data=payload, headers=headers)
        x = []
        for i in range(len(song_filename_list)):
            x.append(i)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(get_file, x)

def get_file(x):
    download_request = s.get(song_link_list[x], headers=download_headers, stream=True)
    if download_request.status_code == 200:
        print(f"Downloading {x+1} out of {len(song_filename_list)}!\n")
    else:
        print(f"\nStatus Code: {download_request.status_code}!\n")
        sys.exit()
    with open(song_filename_list[x], "wb") as file:
        file.write(download_request.content)
Could someone explain to me what I am doing wrong?
Because nothing happens after the get_file function call.
It skips all the code and exits without any errors, so where is my logic wrong?
EDIT 1:
After adding prints to:
print(song_filename_list, song_link_list)
with concurrent.futures.ThreadPoolExecutor() as executor:
    print("Before executor.map")
    executor.map(get_file, zip(song_filename_list, song_link_list))
    print("After executor.map")
print(song_filename_list, song_link_list)
And prints at the start and end of get_file and around its file.write.
The output is as follows:
Succesfully logged in!
["songs names"] ["songs links"] <- These are correct.
Before executor.map
After executor.map
["songs names"] ["songs links"] <- These are correct.
Exiting.
In other words, the values are correct, but get_file is skipped inside executor.map.
EDIT 2:
Here are the values used.
song_filename_list = ['100049 Himeringo - Yotsuya-san ni Yoroshiku.osz', '1001507 ZUTOMAYO - Kan Saete Kuyashiiwa.osz']
song_link_list = ['https://osu.ppy.sh/beatmapsets/100049/download', 'https://osu.ppy.sh/beatmapsets/1001507/download']
EDIT 3:
After some tinkering around it would seem that this works.
for i in range(len(song_filename_list)):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.submit(get_file, song_filename_list, song_link_list, i, s)

def get_file(song_filename_list, song_link_list, i, s):
    download_request = s.get(song_link_list[i], headers=download_headers, stream=True)
    if download_request.status_code == 200:
        print("Downloading...")
    else:
        print(f"\nStatus Code: {download_request.status_code}!\n")
        sys.exit()
    with open(song_filename_list[i], "wb") as file:
        file.write(download_request.content)
In your download() function you submit the whole array, while you should submit each item:
def download(song_filename_list, song_link_list):
    with requests.Session() as s:
        login_request = s.post(login_url,
                               data=payload,
                               headers=headers)
        for i in range(len(song_filename_list)):
            with concurrent.futures.ThreadPoolExecutor() as executor:
                executor.submit(get_file, i)
You can simplify this with the executor's .map() method:
def download(song_filename_list, song_link_list):
    with requests.Session() as session:
        session.post(login_url,
                     data=payload,
                     headers=headers)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(get_file, song_filename_list, song_link_list)
Where the get_file function is:
def get_file(song_name, song_link):
    with requests.Session() as session:
        download_request = session.get(song_link,
                                       headers=download_headers,
                                       stream=True)
        if download_request.status_code == 200:
            print(f"Downloaded {song_name}")
        else:
            print(f"\nStatus Code: {download_request.status_code}!\n")
            return  # skip writing the file for a failed request
        with open(song_name, "wb") as file:
            file.write(download_request.content)
This avoids sharing state between threads, which avoids potential data races.
If you need to monitor how many songs have been downloaded, you can use tqdm, which has a thread_map iterator wrapper that does exactly this.
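A minimal sketch of that, assuming the get_file(song_name, song_link) signature above; thread_map wraps ThreadPoolExecutor.map() and shows a progress bar while it runs:
from tqdm.contrib.concurrent import thread_map

thread_map(get_file, song_filename_list, song_link_list)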

Python : Manga parsing return empty file

I want to parse images from a "certain" manga and chapter. Here's my code:
import requests, bs4, os, urllib.request

try:
    url = "http://manganelo.com/chapter/read_one_punch_man_manga_online_free3/chapter_136"
    res = requests.get(url)
    print("[+] Asking a request to " + url)

    # slice the url so it only contains the name and chapter
    name = url[34:].replace("/", "_")
    os.mkdir(name)
    print("[+] Making '{}' directory".format(name))
    os.chdir(os.path.join(os.getcwd(), name))

    soup = bs4.BeautifulSoup(res.text, "html.parser")
    for img in soup.findAll("img"):
        manga_url = img.get("src")
        manga_name = img.get("alt") + ".jpg"
        urllib.request.urlretrieve(manga_url, manga_name)
        print("[+] Downloading: " + manga_name)
except Exception as e:
    print("[-] Error: " + str(e))
It works fine, BUT only for specific chapters. Let's say I put chapter 130: when I run the code it returns a blank file, but if I put chapter 136 or others it works fine. How can this happen?
You can replace urllib.request.urlretrieve(manga_url, manga_name) with:
import shutil  # needed for copyfileobj

r = requests.get(manga_url, stream=True)
if r.status_code == 200:
    with open(manga_name, 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
Actually, the remote server is apparently checking the User-Agent header and rejecting requests from Python's urllib.
On the other hand, you can use:
opener = urllib.request.URLopener()
opener.addheader('User-Agent', 'whatever')
opener.retrieve(manga_url, manga_name)
This works for me
Hope this helps
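If you prefer to stay with urllib, a hedged sketch using urllib.request.Request with an explicit User-Agent instead of the deprecated URLopener (the header value here is arbitrary):
import urllib.request

req = urllib.request.Request(manga_url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req) as resp, open(manga_name, "wb") as f:
    f.write(resp.read())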

GET works, POST doesn't

I'm writing a small Python 2.x app which grabs images from URLs, converts them to base64, then submits them using requests to an API server as parameters of a POST request. My admittedly amateurish code is as follows:
import csv
import json
import requests
import base64
import getpass

f = raw_input("Enter CSV filename: ")
global clientCode
clientCode = raw_input("Enter customer code: ")
username = raw_input("Enter username: ")
password = getpass.getpass("Enter password: ")
global url
url = "https://" + clientCode + ".redacted.com/api"

def getSessionKey():
    querystring = {"request": "verifyUser", "username": username, "password": password, "clientCode": clientCode}
    response = requests.request("GET", url, params=querystring, timeout=10)
    jr = json.loads(response.text)
    # print(response.text)
    global sessionKey
    sessionKey = jr['records'][0]['sessionKey']
    errorCode = jr['status']['errorCode']

with open(f, 'rb') as myfile:
    reader = csv.reader(myfile)
    rownum = 0
    getSessionKey()
    for row in reader:
        productID = row[0]
        imageURL = row[1]
        dlimage = requests.get(imageURL, stream=True, timeout=10)
        encodedImage = base64.encodestring(dlimage.content)
        imagequery = {'clientCode': clientCode, 'sessionKey': sessionKey, 'request': 'saveProductPicture', 'productID': productID, 'picture': encodedImage}
        response = requests.post(url, data=imagequery, timeout=10)
        print response.status_code
        ir = json.loads(response.text)
        errorCode = ir['status']['errorCode']
        print errorCode
        rownum = rownum + 1
Now, if I change the response line to response = requests.get(url, params=imagequery, timeout=10), it works. But since this is a GET request, the server throws an HTTP 414 error for any image larger than about 1 KB. If I run the code as above, the API server gives an error indicating it is not seeing the clientCode parameter, so it would stand to reason that it is not seeing any of the data. What am I doing wrong?
Thanks for helping me learn by doing.
I'm still not sure why requests was behaving the way it was, but I rewrote the code to use httplib instead, and it works.
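For what it's worth, two hedged guesses rather than a confirmed diagnosis, reusing the variables from the snippet above: base64.encodestring() inserts a newline every 76 characters, which some form parsers mishandle, and the API may only read parameters from the query string, which would explain why the GET variant worked.
# b64encode() yields a single-line value, unlike encodestring():
encodedImage = base64.b64encode(dlimage.content)

# Keep the small parameters in the query string (as the working GET did)
# while the large picture travels in the POST body:
response = requests.post(url,
                         params={'clientCode': clientCode,
                                 'sessionKey': sessionKey,
                                 'request': 'saveProductPicture',
                                 'productID': productID},
                         data={'picture': encodedImage},
                         timeout=10)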
