I would like to multi-thread the following working piece of code with concurrent.futures, but nothing I've tried so far seems to work.
def download(song_filename_list, song_link_list):
    with requests.Session() as s:
        login_request = s.post(login_url, data=payload, headers=headers)
        for x in range(len(song_filename_list)):
            download_request = s.get(song_link_list[x], headers=download_headers, stream=True)
            if download_request.status_code == 200:
                print(f"Downloading {x+1} out of {len(song_filename_list)}!\n")
                pass
            else:
                print(f"\nStatus Code: {download_request.status_code}!\n")
                sys.exit()
            with open(song_filename_list[x], "wb") as file:
                file.write(download_request.content)
The two main variables are song_filename_list and song_link_list.
The first list holds the name of each file and the second holds the corresponding download links, so the name and link of each file sit at the same index.
For example: name_of_file1 = song_filename_list[0] and link_of_file1 = song_link_list[0]
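For illustration only (this line is not in the original script), that pairing means the two lists can simply be walked together:

for name, link in zip(song_filename_list, song_link_list):
    print(name, "->", link)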
This is the most recent attempt at multi-threading:
def download(song_filename_list, song_link_list):
    with requests.Session() as s:
        login_request = s.post(login_url, data=payload, headers=headers)
        x = []
        for i in range(len(song_filename_list)):
            x.append(i)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(get_file, x)

def get_file(x):
    download_request = s.get(song_link_list[x], headers=download_headers, stream=True)
    if download_request.status_code == 200:
        print(f"Downloading {x+1} out of {len(song_filename_list)}!\n")
        pass
    else:
        print(f"\nStatus Code: {download_request.status_code}!\n")
        sys.exit()
    with open(song_filename_list[x], "wb") as file:
        file.write(download_request.content)
Could someone explain to me what I am doing wrong?
Nothing happens after the get_file function call: it skips all of that code and exits without any errors, so where is my logic wrong?
EDIT 1:
After adding prints to:
print(song_filename_list, song_link_list)

with concurrent.futures.ThreadPoolExecutor() as executor:
    print("Before executor.map")
    executor.map(get_file, zip(song_filename_list, song_link_list))
    print("After executor.map")

print(song_filename_list, song_link_list)
and prints at the start and end of get_file and around its file.write, the output is as follows:
Succesfully logged in!
["songs names"] ["songs links"] <- These are correct.
Before executor.map
After executor.map
["songs names"] ["songs links"] <- These are correct.
Exiting.
In other words, the values are correct, but get_file inside executor.map seems to be skipped.
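One detail worth knowing about executor.map (a side note with a minimal sketch reusing the names above): it only re-raises exceptions from the worker threads when its results are consumed, so a get_file that crashes, for example because it received a (name, link) tuple instead of an index, can look as if it was silently skipped. Forcing the results out makes the hidden error visible:

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(get_file, zip(song_filename_list, song_link_list))
    for result in results:  # re-raises any exception that occurred inside get_file
        pass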
EDIT 2:
Here are the values used.
song_filename_list = ['100049 Himeringo - Yotsuya-san ni Yoroshiku.osz', '1001507 ZUTOMAYO - Kan Saete Kuyashiiwa.osz']
song_link_list = ['https://osu.ppy.sh/beatmapsets/100049/download', 'https://osu.ppy.sh/beatmapsets/1001507/download']
EDIT 3:
After some tinkering around, it would seem that this works:
for i in range(len(song_filename_list)):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.submit(get_file, song_filename_list, song_link_list, i, s)

def get_file(song_filename_list, song_link_list, i, s):
    download_request = s.get(song_link_list[i], headers=download_headers, stream=True)
    if download_request.status_code == 200:
        print("Downloading...")
        pass
    else:
        print(f"\nStatus Code: {download_request.status_code}!\n")
        sys.exit()
    with open(song_filename_list[i], "wb") as file:
        file.write(download_request.content)
In your download() function you submit the whole list, while you should submit each item:
def download(song_filename_list, song_link_list):
    with requests.Session() as s:
        login_request = s.post(login_url,
                               data=payload,
                               headers=headers)
        for i in range(len(song_filename_list)):
            with concurrent.futures.ThreadPoolExecutor() as executor:
                executor.submit(get_file, i)
You can simplify this with the executor's .map() method:
def download(song_filename_list, song_link_list):
    with requests.Session() as session:
        session.post(login_url,
                     data=payload,
                     headers=headers)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(get_file, song_filename_list, song_link_list)
Where the get_file function is:
def get_file(song_name, song_link):
    with requests.Session() as session:
        download_request = session.get(song_link,
                                       headers=download_headers,
                                       stream=True)
        if download_request.status_code == 200:
            print(f"Downloaded {song_name}")
        else:
            print(f"\nStatus Code: {download_request.status_code}!\n")

        with open(song_name, "wb") as file:
            file.write(download_request.content)
This avoids sharing state between threads, which avoids potential data races.
If you need to monitor how many songs have been downloaded, you can use tqdm, whose thread_map iterator wrapper does exactly this.
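For example, a minimal sketch (assuming the get_file above; thread_map lives in tqdm.contrib.concurrent and wraps a ThreadPoolExecutor with a progress bar):

from tqdm.contrib.concurrent import thread_map

def download(song_filename_list, song_link_list):
    with requests.Session() as session:
        session.post(login_url, data=payload, headers=headers)

    # one progress-bar tick per completed get_file call
    thread_map(get_file, song_filename_list, song_link_list)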
Related
def get_ship_position(ship_id):
    import requests

    url = "https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    headers = {
        "accept": "application/json",
        "accept-encoding": "gzip, deflate",
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response.json()

def main():
    from time import perf_counter

    start = perf_counter()
    i = 7550
    while i <= 9999:
        i += 1
        try:
            data = get_ship_position(i)
            with open("marinetraffic.txt", "a", encoding="utf-8") as bos:
                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}".format(data["mmsi"], data["imo"], data["name"], data["nameAis"], data["type"], data["typeSpecific"], data["yearBuilt"], data["length"], data["breadth"], data["callsign"], data["country"], data["deadweight"], data["grossTonnage"], data["homePort"], data["status"]), file=bos)
            print(i, "Yazdı")
        except Exception:
            print(i, "Hata")
            with open("marinetraffichata.txt", "a", encoding="utf-8") as hata:
                print("Hata", i, file=hata)
            pass
    stop = perf_counter()
    print("çalışılan süre:", stop - start, "saniye")
    # return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
I am progressing very slowly with the requests module; how can I make the code run faster? I've seen the aiohttp and asyncio modules and they are really fast. How can I adapt my own code?
Using asyncio and aiohttp is certainly one way of being able to do concurrent URL retrievals. But I am wondering if it is the best way given (1) you are already using requests and (2) you want to retrieve 2450 URLs, but not necessarily all at the same time.
By using a multithreading pool of size N, you would have N threads concurrently retrieving up to N URLs. By setting an "appropriate" value for N you can control the degree of concurrency. Performance could improve by increasing N but at some point as N got larger, performance could start to decrease. There is also the possibility that the website might think you are performing a Denial of Service attack by making so many concurrent requests.
In the code below I am using a value of 64 for N and creating a Session instance for doing the retrievals, which should also improve performance. I am using the method multiprocessing.pool.ThreadPool.imap to process the returned data elements as they become available. This method returns an iterator that, when iterated, yields the next return value from your worker function, get_ship_position. However, I am explicitly calling next to iterate so that I can individually handle exceptions raised by get_ship_position. If I were to instead iterate with for data in pool.imap(worker, range(7551, 10_001)), then once an exception was raised by an invocation of get_ship_position I would not be able to continue iterating subsequent results.
def get_ship_position(session, ship_id):
    url = "https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    response = session.get(url)
    response.raise_for_status()
    return response.json()

def main():
    from time import perf_counter
    import requests
    from multiprocessing.pool import ThreadPool
    from functools import partial

    start = perf_counter()
    with requests.Session() as session:
        headers = {
            "accept": "application/json",
            "accept-encoding": "gzip, deflate",
            "user-agent": "Mozilla/5.0",
            "x-requested-with": "XMLHttpRequest"
        }
        session.headers = headers
        with ThreadPool(64) as pool:
            worker = partial(get_ship_position, session)
            it = pool.imap(worker, range(7551, 10_001))
            i = 7550
            with open("marinetraffic.txt", "a", encoding="utf-8") as f:
                while True:
                    i += 1
                    try:
                        data = next(it)
                        print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}".format(data["mmsi"], data["imo"], data["name"], data["nameAis"], data["type"], data["typeSpecific"], data["yearBuilt"], data["length"], data["breadth"], data["callsign"], data["country"], data["deadweight"], data["grossTonnage"], data["homePort"], data["status"]), file=f)
                        print(i, "Yazdı")
                    except StopIteration:
                        break
                    except Exception:
                        print(i, "Hata")
                        print("Hata", i, file=f)
    stop = perf_counter()
    print("çalışılan süre:", stop - start, "saniye")
    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
Using asyncio and aiohttp
The following code uses asyncio and aiohttp. A semaphore set to 64 limits how many coroutines can run concurrently, and therefore how many concurrent GET requests are made. Again, this number can be adjusted to see how performance varies.
import asyncio

async def get_ship_position(session, ship_id):
    url = "https://www.marinetraffic.com/en/vesselDetails/vesselInfo/shipid:{}".format(ship_id)
    async with session.get(url) as response:
        status = response.status
        if status != 200:
            raise Exception(f'Bad status: {status}')
        return await response.json()

async def bounded_fetch(sem, session, ship_id):
    async with sem:
        result = await get_ship_position(session, ship_id)
        return result

async def main():
    from time import perf_counter
    import aiohttp

    start = perf_counter()
    headers = {
        "accept": "application/json",
        "accept-encoding": "gzip, deflate",
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        sem = asyncio.Semaphore(64)
        responses = await asyncio.gather(*(bounded_fetch(sem, session, i) for i in range(7551, 10_001)), return_exceptions=True)
    with open("marinetraffic.txt", "a", encoding="utf-8") as f:
        for i, data in enumerate(responses, start=7551):
            if isinstance(data, Exception):
                print(i, "Hata")
                print("Hata", i, file=f)
            else:
                print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}".format(data["mmsi"], data["imo"], data["name"], data["nameAis"], data["type"], data["typeSpecific"], data["yearBuilt"], data["length"], data["breadth"], data["callsign"], data["country"], data["deadweight"], data["grossTonnage"], data["homePort"], data["status"]), file=f)
                print(i, "Yazdı")
    stop = perf_counter()
    print("çalışılan süre:", stop - start, "saniye")
    return 0

if __name__ == "__main__":
    import sys
    rc = asyncio.get_event_loop().run_until_complete(main())
    sys.exit(rc)
Note
With either version successive runs can produce widely different run times.
Update
If you want to write results to the output file as results are returned instead of after all coroutines have completed then try:
... # code omitted
import aiofiles

async with aiohttp.ClientSession(headers=headers) as session:
    sem = asyncio.Semaphore(64)
    tasks = [asyncio.create_task(bounded_fetch(sem, session, i)) for i in range(7551, 10_001)]
    async with aiofiles.open("marinetraffic.txt", "w", encoding="utf-8") as f:
        for i, task in enumerate(tasks, start=7551):
            try:
                await task
                data = task.result()
                record = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}x{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(data["mmsi"], data["imo"], data["name"], data["nameAis"], data["type"], data["typeSpecific"], data["yearBuilt"], data["length"], data["breadth"], data["callsign"], data["country"], data["deadweight"], data["grossTonnage"], data["homePort"], data["status"])
                await f.write(record)
                print(i, "Yazdı")
            except:
                print(i, "Hata")
                # aiofiles file objects are written with await, not print(file=...)
                await f.write("Hata {}\n".format(i))
print(i,"Yazdı")
except:
print(i,"Hata")
print("Hata",i,file=f)
... # code omitted
This is the code I'm running synchronously:
import requests

URL = "http://maps.googleapis.com/maps/api/geocode/json"
location = "delhi technological university"
PARAMS = {'address': location}

for _ in range(1, 100):
    r = requests.get(url=URL, params=PARAMS)
    print(r)
How can I run the same code asynchronously using Python? I tried this:
import requests
import asyncio

loop = asyncio.get_event_loop()

URL = "http://maps.googleapis.com/maps/api/geocode/json"
location = "delhi technological university"
PARAMS = {'address': location}

async def run():
    for j in range(1, 100):
        r = requests.get(url=URL, params=PARAMS)

if __name__ == "__main__":
    loop.run_until_complete(run())
    loop.close()
I tried the above code but I'm getting a runtime error: RuntimeError: This event loop is already running.
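requests is a blocking library, so wrapping its calls in a coroutine does not make them concurrent; the usual approach is to switch to aiohttp and gather the coroutines. A minimal sketch of that (an illustration, not the poster's code) is below; the "event loop is already running" error typically appears when such a snippet is run inside an environment, such as a notebook, that already runs its own loop.

import asyncio
import aiohttp

URL = "http://maps.googleapis.com/maps/api/geocode/json"
PARAMS = {'address': "delhi technological university"}

async def fetch(session):
    # each coroutine awaits its own GET; awaiting yields control so others can run
    async with session.get(URL, params=PARAMS) as response:
        return response.status

async def run():
    async with aiohttp.ClientSession() as session:
        # fire all 99 requests concurrently and wait for them to finish
        statuses = await asyncio.gather(*(fetch(session) for _ in range(1, 100)))
        print(statuses)

if __name__ == "__main__":
    asyncio.run(run())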
Word of notice: This is my first approach with asyncio, so I might have done something really stupid.
Scenario is as follows:
I need to "http-ping" a humongous list of urls to check if they respond 200 or any other value. I get timeouts for each and every request, though tools like gobuster report 200,403, etc.
My code is something similar to this:
import asyncio, aiohttp
import datetime

#-------------------------------------------------------------------------------------
async def get_data_coroutine(session, url, follow_redirects, timeout_seconds, retries):
    #print('#DEBUG '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+url)
    try:
        async with session.get(url, allow_redirects=False, timeout=timeout_seconds) as response:
            status = response.status
            #res = await response.text()
            if status == 404:
                pass
            elif 300 <= status and status < 400:
                location = str(response).split("Location': \'")[1].split("\'")[0]
                print('#HIT '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+str(status)+' '+url+' ---> '+location)
                if follow_redirects == True:
                    return await get_data_coroutine(session, location, follow_redirects, timeout_seconds, retries)
            else:
                print('#HIT '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+' '+str(status)+' '+url)
            return None
    except asyncio.exceptions.TimeoutError as e:
        print('#ERROR '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+'   '+url+' TIMEOUT '+str(e))
        return None

#---------------------------------------------------------------------------
async def main(loop):
    base_url = 'http://192.168.59.37'
    extensions = ['', '.html', 'php']
    fd = open('/usr/share/wordlists/dirb/common.txt', 'r')
    words_without_suffix = [x.strip() for x in fd.readlines()]  #[-5:] #DEBUG!
    words_with_suffix = [base_url+'/'+x+y for x in words_without_suffix for y in extensions]
    follow = True
    total_timeout = aiohttp.ClientTimeout(total=60*60*24)
    timeout_seconds = 10
    retries = 1
    async with aiohttp.ClientSession(loop=loop, timeout=total_timeout) as session:
        tasks = [get_data_coroutine(session, url, follow, timeout_seconds, retries) for url in words_with_suffix]
        await asyncio.gather(*tasks)
        print('DONE')

#---------------------------------------------------------------------------
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(main(loop))
Did I do something really wrong?
Any word of advice?
Thank you SO much!
Actually, I ended up finding an open issue in aio-libs/aiohttp:
https://github.com/aio-libs/aiohttp/issues/3203
There they suggest a workaround that achieves my needs: use per-socket timeouts instead of a total timeout, since the total timeout also counts the time a request spends waiting for a free connection and therefore expires when many requests are queued.
session_timeout = aiohttp.ClientTimeout(total=None, sock_connect=timeout_seconds, sock_read=timeout_seconds)

async with aiohttp.ClientSession(timeout=session_timeout) as session:
    async with session.get(url, allow_redirects=False, timeout=1) as response:
        ...
To answer your question - no you did nothing wrong. I can't see anything wrong with your code in terms of http request/response/timeout handling.
If indeed all your requests are timing out to the host (http://192.168.59.37), I suspect the issues you are experiencing are most likely down to how your network is resolving requests (or how your code is building the url).
You can confirm whether requests are independently succeeding/failing using a tool like curl, eg:
curl "http://192.168.59.37/abc.html"
I tested it locally by using
python3 -m http.server 8080
and placing empty files 'abc' and 'abc.html' in the same directory, updating the base_url to
base_url = "http://127.0.0.1:8080"
With my minor updates (code below), here's the output.
http://127.0.0.1:8080/.bashrc.php
#404
http://127.0.0.1:8080/.bashrc
#404
http://127.0.0.1:8080/.bashrc.html
#404
http://127.0.0.1:8080/abc
#HIT 2020-11-03 12:57:33 200 http://127.0.0.1:8080/abc
http://127.0.0.1:8080/zt.php
#404
http://127.0.0.1:8080/zt.html
#404
http://127.0.0.1:8080/zt
#404
http://127.0.0.1:8080/abc.html
#HIT 2020-11-03 12:57:33 200 http://127.0.0.1:8080/abc.html
http://127.0.0.1:8080/abc.php
#404
DONE
My updates are mostly minor, but they might help with further debugging.
For debugging, print the url. It is important to determine whether the code is building the url correctly; this highlighted to me that the 'php' extension is missing a ".", so it would be looking for abcphp, not abc.php.
Use response.ok to test for a successful http response; your code wasn't handling 500 errors (instead it was reporting them as hits).
Use Python f-strings for cleaner formatting.
import asyncio
import aiohttp
import datetime

async def get_data_coroutine(session, url, follow_redirects, timeout_seconds, retries):
    try:
        async with session.get(
            url, allow_redirects=False, timeout=timeout_seconds
        ) as response:
            print(url)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if response.ok:
                print(f"#HIT {now} {response.status} {url}")
            else:
                status = response.status
                if status == 404:
                    print("#404")
                elif 300 <= status and status < 400:
                    location = str(response).split("Location': '")[1].split("'")[0]
                    print(f"#HIT {now} {status} {url} ---> {location}")
                    if follow_redirects is True:
                        return await get_data_coroutine(
                            session, location, follow_redirects, timeout_seconds, retries
                        )
                else:
                    print("#ERROR ", response.status)
            return None
    except asyncio.TimeoutError as e:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"#ERROR {now} {url} TIMEOUT ", e)
        return None

async def main(loop):
    base_url = "http://127.0.0.1:8080"
    extensions = ["", ".html", ".php"]
    fd = open("/usr/share/wordlists/dirb/common.txt", "r")
    words_without_suffix = [x.strip() for x in fd.readlines()]
    words_with_suffix = [
        base_url + "/" + x + y for x in words_without_suffix for y in extensions
    ]
    follow = True
    total_timeout = aiohttp.ClientTimeout(total=60 * 60 * 24)
    timeout_seconds = 10
    retries = 1
    async with aiohttp.ClientSession(loop=loop, timeout=total_timeout) as session:
        tasks = [
            get_data_coroutine(session, url, follow, timeout_seconds, retries)
            for url in words_with_suffix
        ]
        await asyncio.gather(*tasks)
        print("DONE")

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(main(loop))
I wrote a script that "parses" all domains from a file. After the launch, everything works as it should. But when there are several domains left at the end, it gets stuck. Sometimes it takes a long time to parse the last couple of domains. I can't figure out what the problem is. Has anyone faced such a situation? How can I fix it?
After the launch, everything runs very quickly (as it should) until the end. At the end, it stops when there are several domains left. It makes no difference whether there are 1,000 domains or 10,000 domains.
Complete code:
import re
import sys
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
pool = 100
with open("Rules.json") as file:
REGEX = json.loads(file.read())
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'}
def Domain_checker(domain):
try:
r = requests.get("http://" + domain, verify=False, headers=ua)
r.encoding = "utf-8"
for company in REGEX.keys():
for type in REGEX[company]:
check_entry = 0
for ph_regex in REGEX[company][type]:
if bool(re.search(ph_regex, r.text)) is True:
check_entry += 1
if check_entry == len(REGEX[company][type]):
title = BeautifulSoup(r.text, "lxml")
Found_domain = "\nCompany: {0}\nRule: {1}\nURL: {2}\nTitle: {3}\n".format(company, type, r.url, title.title.text)
print(Found_domain)
with open("/tmp/__FOUND_DOMAINS__.txt", "a", encoding='utf-8', errors = 'ignore') as file:
file.write(Found_domain)
except requests.exceptions.ConnectionError:
pass
except requests.exceptions.TooManyRedirects:
pass
except requests.exceptions.InvalidSchema:
pass
except requests.exceptions.InvalidURL:
pass
except UnicodeError:
pass
except requests.exceptions.ChunkedEncodingError:
pass
except requests.exceptions.ContentDecodingError:
pass
except AttributeError:
pass
except ValueError:
pass
return domain
if __name__ == '__main__':
with open(sys.argv[1], "r", encoding='utf-8', errors = 'ignore') as file:
Domains = file.read().split()
pool = 100
print("Pool = ", pool)
results = ThreadPool(pool).imap_unordered(Domain_checker, Domains)
string_num = 0
for result in results:
print("{0} => {1}".format(string_num, result))
string_num += 1
with open("/tmp/__FOUND_DOMAINS__.txt", encoding='utf-8', errors = 'ignore') as found_domains:
found_domains = found_domains.read()
print("{0}\n{1}".format("#" * 40, found_domains))
requests.get("http://" + domain, headers=ua, verify=False, timeout=10)
The problem was resolved after setting a timeout: without one, requests can wait indefinitely on a server that accepts the connection but never responds, so the pool appears to hang on the last few domains.
Thank you to the user with the nickname "eri" (https://ru.stackoverflow.com/users/16574/eri) :)
I created the following script to download images from an API endpoint, and it works as intended. The thing is that it is rather slow, as all the requests have to wait on each other. What is the correct way to keep the steps sequential for each item I want to fetch, but make things parallel across items? This is for an online service called
servicem8
So what I hope to achieve is:
fetch all possible job ids => keep name/and other info
fetch name of the customer
fetch each attachment of a job
These three steps should be done for each job. So I could make things parallel for each job as they do not have to wait on each other.
Update:
What I do not understand is how you can bundle, for example, the three calls per item into one unit, since it is only within an item that the calls depend on each other. For example, when I want to
fetch item( fetch name => fetch description => fetch id)
it is the fetch item step as a whole that I want to run in parallel.
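One way to picture that (a rough sketch only, reusing the user, passw and scheduled_jobs names from the code below, with a hypothetical fetch_job helper): keep the dependent calls for a single job sequential inside one worker function, and let the executor run that worker for many jobs in parallel.

import concurrent.futures
import requests

def fetch_job(job):
    # job is one entry of scheduled_jobs: [uuid, generated_job_id, customer_name]
    url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    attachments = requests.get(url, auth=(user, passw)).json()
    # ...fetch staff details and download the attachment files here, as in the loop below...
    return job[1], len(attachments)

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for job_id, n_attachments in executor.map(fetch_job, scheduled_jobs):
        print("job {} has {} attachments".format(job_id, n_attachments))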
The current code I have is working but rather slow:
import requests
import dateutil.parser
import shutil
import os

user = "test#test.com"
passw = "test"

print("Read json")
url = "https://api.servicem8.com/api_1.0/job.json"
r = requests.get(url, auth=(user, passw))
print("finished reading jobs.json file")

scheduled_jobs = []

if r.status_code == 200:
    for item in r.json():
        scheduled_date = item['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 10:
                    if parsed_date.day == 10:
                        url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
                        c = requests.get(url_customer, auth=(user, passw))
                        cus_name = c.json()['name']
                        scheduled_jobs.append(
                            [item['uuid'], item['generated_job_id'], cus_name])
        except ValueError:
            pass

for job in scheduled_jobs:
    print("fetch for job {}".format(job))
    url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    r = requests.get(url, auth=(user, passw))
    if r.json() == []:
        pass
    for attachment in r.json():
        if attachment['active'] == 1 and attachment['file_type'] != '.pdf':
            print("fetch for attachment {}".format(attachment))
            url_staff = "https://api.servicem8.com/api_1.0/Staff.json?%24filter=uuid%20eq%20{}".format(
                attachment['created_by_staff_uuid'])
            s = requests.get(url_staff, auth=(user, passw))
            for staff in s.json():
                tech = "{}_{}".format(staff['first'], staff['last'])
                url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
                r = requests.get(url, auth=(user, passw), stream=True)
                if r.status_code == 200:
                    creation_date = dateutil.parser.parse(
                        attachment['timestamp']).strftime("%d.%m.%y")
                    if not os.path.exists(os.getcwd() + "/{}/{}".format(job[2], job[1])):
                        os.makedirs(os.getcwd() + "/{}/{}".format(job[2], job[1]))
                    path = os.getcwd() + "/{}/{}/SC -O {} {}{}".format(
                        job[2], job[1], creation_date, tech.upper(), attachment['file_type'])
                    print("writing file to path {}".format(path))
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                else:
                    print(r.text)
Update [14/10]
I updated the code in the following way with some of the hints given. Thanks a lot for that. The only thing I could still optimize, I guess, is the attachment downloading, but it is working fine now. A funny thing I learned is that you cannot create a CON folder on a Windows machine :-) I did not know that.
I use pandas as well, just to try to avoid some loops over my list of dicts, but I am not sure this is already the most performant version. The longest part is actually reading in the full json files. I read them in fully because I could not find an API way of asking the API to return only the jobs from September 2016. The API query function seems to work on eq/lt/gt.
import requests
import dateutil.parser
import shutil
import os
import pandas as pd

user = ""
passw = ""

FOLDER = os.getcwd()
headers = {"Accept-Encoding": "gzip, deflate"}

import grequests
urls = [
    'https://api.servicem8.com/api_1.0/job.json',
    'https://api.servicem8.com/api_1.0/Attachment.json',
    'https://api.servicem8.com/api_1.0/Staff.json',
    'https://api.servicem8.com/api_1.0/Company.json'
]

#Create a set of unsent Requests:
print("Read json files")
rs = (grequests.get(u, auth=(user, passw), headers=headers) for u in urls)
#Send them all at the same time:
jobs, attachments, staffs, companies = grequests.map(rs)

#create dataframes
df_jobs = pd.DataFrame(jobs.json())
df_attachments = pd.DataFrame(attachments.json())
df_staffs = pd.DataFrame(staffs.json())
df_companies = pd.DataFrame(companies.json())

#url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
#c = requests.get(url_customer, auth=(user, passw))
#url = "https://api.servicem8.com/api_1.0/job.json"
#jobs = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading attachments json")
#url = "https://api.servicem8.com/api_1.0/Attachment.json"
#attachments = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading staff.json")
#url_staff = "https://api.servicem8.com/api_1.0/Staff.json"
#staffs = requests.get(url_staff, auth=(user, passw))

scheduled_jobs = []

if jobs.status_code == 200:
    print("finished reading json file")
    for job in jobs.json():
        scheduled_date = job['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016:
                if parsed_date.month == 9:
                    cus_name = df_companies[df_companies.uuid == job['company_uuid']].iloc[0]['name'].upper()
                    cus_name = cus_name.replace('/', '')
                    scheduled_jobs.append([job['uuid'], job['generated_job_id'], cus_name])
        except ValueError:
            pass
    print("{} jobs to fetch".format(len(scheduled_jobs)))

for job in scheduled_jobs:
    print("fetch for job attachments {}".format(job))
    #url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    if attachments == []:
        pass
    for attachment in attachments.json():
        if attachment['related_object_uuid'] == job[0]:
            if attachment['active'] == 1 and attachment['file_type'] != '.pdf' and attachment['attachment_source'] != 'INVOICE_SIGNOFF':
                for staff in staffs.json():
                    if staff['uuid'] == attachment['created_by_staff_uuid']:
                        tech = "{}_{}".format(
                            staff['first'].split()[-1].strip(), staff['last'])
                        creation_timestamp = dateutil.parser.parse(
                            attachment['timestamp'])
                        creation_date = creation_timestamp.strftime("%d.%m.%y")
                        creation_time = creation_timestamp.strftime("%H_%M_%S")
                        path = FOLDER + "/{}/{}/SC_-O_D{}_T{}_{}{}".format(
                            job[2], job[1], creation_date, creation_time, tech.upper(), attachment['file_type'])
                        # fetch attachment
                        if not os.path.isfile(path):
                            url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment['uuid'])
                            r = requests.get(url, auth=(user, passw), stream=True)
                            if r.status_code == 200:
                                if not os.path.exists(FOLDER + "/{}/{}".format(job[2], job[1])):
                                    os.makedirs(
                                        FOLDER + "/{}/{}".format(job[2], job[1]))
                                print("writing file to path {}".format(path))
                                with open(path, 'wb') as f:
                                    r.raw.decode_content = True
                                    shutil.copyfileobj(r.raw, f)
                            else:
                                print(r.text)
                        else:
                            print("file already exists")
The general idea is to use asynchronous url requests, and there is a Python module named grequests for that: https://github.com/kennethreitz/grequests
From the documentation:
import grequests
urls = [
    'http://www.heroku.com',
    'http://python-tablib.org',
    'http://httpbin.org',
    'http://python-requests.org',
    'http://fakedomain/',
    'http://kennethreitz.com'
]
#Create a set of unsent Requests:
rs = (grequests.get(u) for u in urls)
#Send them all at the same time:
grequests.map(rs)
And the response:
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, None, <Response [200]>]
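Applied to the script above, the same pattern could also batch the per-job Attachment.json lookups instead of requesting them one at a time (a sketch reusing the poster's user, passw and scheduled_jobs variables):

import grequests

attachment_urls = [
    "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
    for job in scheduled_jobs
]
rs = (grequests.get(u, auth=(user, passw)) for u in attachment_urls)
for job, response in zip(scheduled_jobs, grequests.map(rs)):
    if response is not None and response.status_code == 200:
        print("job {} has {} attachments".format(job[1], len(response.json())))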