writing variables in pandas dataframe with header

writing variables in pandas dataframe with header - python

i have a script that runs api calls agains a flask app. i want to create a pandas datafram with the statuscode and the elapsed time of the request which i can write to a csv file.
My problem is that only one entry ends up being in the csv file and i dont know why.
the headers should be "statuscode" and "elapsed time".
when i am printing the statuscode and elapsedtime variables every response is printed and not only one
with this csv file i want to create a grap to visualize the responstimes
i tried to write the "write_df" fuction but ended up using the variables from the requests in the "send_api_request" function.
import requests
import datetime
import concurrent.futures
import csv
import pandas as pd
HOST = 'http://127.0.0.1:5000'
API_PATH = '/'
ENDPOINT = HOST + API_PATH
MAX_THREADS = 8
CONCURRENT_THREADS = 10
csv_path = "flasktests.csv"
try:
file = open(csv_path, 'w', newline='')
writer = csv.writer(file)
except:
print("error opening or writing to the CSV file!")
def send_api_request():
try:
#print ('Sending API request: ', ENDPOINT)
r = requests.get(ENDPOINT)
if r.status_code == 200:
#print('Received: ', r.status_code, r.elapsed)
responses = {"statuscode":[r.status_code], "elapsed time": [r.elapsed]}
statuscode = r.status_code
elapsedtime = r.elapsed
print(statuscode, elapsedtime)
df = pd.DataFrame([statuscode,elapsedtime], columns=["statuscode","elapsed time"])
df.to_csv(csv_path, index=False)
elif r.status_code == 417:
print('Received error code:', r.status_code, r.json())
except Exception as e:
print("error",str(e))
def write_df(statuscode, elapsedtime):
print(statuscode,elapsedtime)
df = pd.DataFrame({"statuscode":[statuscode], "elapsed time": [elapsedtime]})
df.to_csv(csv_path, index=False)
print(df)
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
futures = [executor.submit(send_api_request) for x in range (CONCURRENT_THREADS)]
executor.shutdown(wait=True)
any ideas what i am doing wrong here?
Thank you!

df = pd.DataFrame([statuscode,elapsedtime], columns=["statuscode","elapsed, "elapsed time": [elapsedtime]})
df.to_csv(csv_path, index=False)
Here, you are writing a csv with only one entry, because your dataframe df only has one row.
And everytime this function is being executed, you will overwrite the existing csv. This is why the output file only contains one line (2 if you count the header, which should be there)
I see you are trying to use multithreading, which is probably not needed. If you do want to keep using it, you will need to make sure that you are not writing to the file with two threads at the same time, which is a lot of overhead. Instead, I would suggest you do something like this:
def send_api_request():
try:
#print ('Sending API request: ', ENDPOINT)
r = requests.get(ENDPOINT)
if r.status_code == 200:
#print('Received: ', r.status_code, r.elapsed)
return r.status_code, r.elapsed
elif r.status_code == 417:
print('Received error code:', r.status_code, r.json())
return r.status_code, None
except Exception as e:
print("error",str(e))
# number of times you want to call the API
nb_api_calls = 10
codes = []
elapsed_times = []
for i in range(nb_api_calls):
code, elapsed_time = send_api_request()
if elapsed_time is None:
# An error happened, choose what do do with that information
# Here I am just skipping it
pass
else:
codes.append(code)
elapsed_times.append(elapsed_time)
df = pd.DataFrame({"statuscode": codes,"elapsed time": elapsed_times})
df.to_csv(csv_path, index=False)
Note that, with the current configuration, the "statuscode" column will only contain the value 200 many times

Related

Python program looks hung. CTRL-C resumes it

I have a python program, which will make some api calls and print some output depending upon the api's response. I am using flush = true argument in the print function call. Sometimes the python program looks hung (It won't print anything to the terminal). But when i press CTRL-C, some output is printed to the output and python (looks to)resumes again. I have not written any signal handler for SIGINT. Why is this behavior?
UPDATE (Added python code)
import requests
import config
import json
from ipaddress import IPv4Address, IPv4Network
ip_list = []
filepath = "ip_address_test.txt"
with open(filepath) as fp:
line = fp.readline()
ip_list.append(line.rstrip())
while line:
line = fp.readline()
ip_list.append(line.rstrip())
print(ip_list)
del ip_list[-1]
sysparm_offset = 0
sysparm_limit = 2
flag = 1
for ip in ip_list:
hosts = []
while flag == 1:
print("ip is "+str(ip), flush=True)
# Set the request parameters
url = 'https://xxx.service-now.com/api/now/table/u_cmdb_ci_subnet?sysparm_query=u_lifecycle_status!=end of life^GOTOname>='+ip+'&sysparm_limit='+str(sysparm_limit)+'&sysparm_offset='+str(sysparm_offset)
# Eg. User name="username", Password="password" for this code sample.
user = config.asknow_username
pwd = config.asknow_password
headers = {"Accept":"application/json"}
# Do the HTTP request
response = requests.get(url, auth=(user, pwd), headers=headers,verify=False)
# Check for HTTP codes other than 200
if response.status_code != 200:
print('Status:', response.status_code, 'Headers:', response.headers, 'Error Response:', response.content)
exit()
# Decode the json response into a dictionary and use the data
result = json.loads(response.content)['result']
iter = 0
while iter < sysparm_limit:
print("iter = "+str(iter),flush=True)
print("checking "+result[iter]["name"], flush=True)
id = result[iter]["u_guid"]
url = "https://xxx.service-now.com/api/now/cmdb/instance/u_cmdb_ci_subnet/" + str(id)
response = requests.get(url, auth=(user, pwd), headers=headers,verify=False)
# Check for HTTP codes other than 200
if response.status_code != 200:
print('Status:', response.status_code, 'Headers:', response.headers, 'Error Response:', response.content)
exit()
result1 = json.loads(response.content)['result']
for relation in result1["outbound_relations"]:
if relation["type"]["display_value"] == "IP Connection::IP Connection":
if IPv4Address(relation["target"]["display_value"]) in IPv4Network(ip):
print("appending host", flush=True)
hosts.append(relation["target"]["display_value"])
else:
print("ip not in subnet", flush=True)
flag = 0
break
if flag == 0:
break
iter = iter+1
sysparm_offset = sysparm_offset + 2
with open('output.txt',"a+") as output_file:
for value in hosts:
output_file.write(str(value)+"\n")
print("completed "+ip,flush=True)
flag = 1

I am not sure if my issue is the same since you didn't mention clicking inside the command prompt and you didn't mention if it also resumes when pressing something besides ctrl-c, but it sounds a lot like what I had.
A little more googling led me to an insight I wish I had found as an answer to your question a while ago:
https://www.davici.nl/blog/the-mystery-of-hanging-batch-script-until-keypress
I had no idea a QuickEdit mode existed but besides following the steps in the article, you can also simply open the commandprompt, right-click the title bar, choose "defaults", and then under "Edit Options" disable "QuickEdit Mode".
I hope this helps you (or someone else) as much as it just helped me

how to check if cpanel can login successfully with python2.7?

i need a script to make it like a cpanel checker, with more than 1 url and the url is stored in a txt file.
usage : python script.py list.txt
format in file list.txt : https://demo.cpanel.net:2083|democom|DemoCoA5620
this is my code but it doesn't work, can someone help me?
Thanks.
import requests, sys
from multiprocessing.dummy import Pool as ThreadPool
try:
with open(sys.argv[1], 'r') as f:
list_data = [line.strip() for line in f if line.strip()]
except IOError:
pass
def cpanel(url):
try:
data = {'user':'democom', 'pass':'DemoCoA5620'}
r = requests.post(url, data=data)
if r.status_code==200:
print "login success"
else:
print "login failed"
except:
pass
def chekers(url):
try:
cpanel(url)
except:
pass
def Main():
try:
start = timer()
pp = ThreadPool(25)
pr = pp.map(chekers, list_data)
print('Time: ' + str(timer() - start) + ' seconds')
except:
pass
if __name__ == '__main__':
Main()

I fixed your code in a way that it will return an actual array containing a boolean array indicating the success of the cpanel function.
from __future__ import print_function
import requests
from multiprocessing.pool import ThreadPool
try:
list_data = ["https://demo.cpanel.net:2083|democom|DemoCoA5620",
"https://demo.cpanel.net:2083|UserDoesNotExist|WRONGPASSWORD",
]
except IOError:
pass
def cpanel(url):
try:
# try to split that url to get username / password
try:
url, username, password = url.split('|')
except Exception as e:
print("Url {} seems to have wrong format. Concrete error: {}".format(url, e))
return False
# build the correct url
url += '/login/?login_only=1'
# build post parameters
params = {'user': username,
'pass': password}
# make request
r = requests.post(url, params)
if r.status_code==200:
print("login for user {} success".format(username))
return True
else:
print("login for user {} failed due to Status Code {} and message \"{}\"".format(username, r.status_code, r.reason))
return False
except Exception as e:
print("Error occured for url {} ".format(e))
return False
def chekers(url):
return cpanel(url)
def Main():
try:
# start = timer()
pp = ThreadPool(1)
pr = pp.map(chekers, list_data)
print(pr)
# print('Time: ' + str(timer() - start) + ' seconds')
except:
pass
if __name__ == '__main__':
Main()
Output:
login for user democom success
login for user UserDoesNotExist failed due to Status Code 401 and message "Access Denied"
[True, False]
Be aware that I replaced your file read operation by some fixed urls.
Since you use request.post I guess you actually want to POST something to that urls. Your code does not do that. If you just want to send a request, use the requests.get method.
See the official documentation for the requests packet: https://2.python-requests.org/en/master/user/quickstart/#make-a-request for more details.
Also note that
"but it doesn't work"
is NOT a question.

python request urls parallel [duplicate]

This question already has an answer here:
How to send multiple http requests python
(1 answer)
Closed 6 years ago.
I created the following script to download images from an API endpoint which works as intended. Thing is that it is rather slow as all the requests have to wait on each other. What is the correct way to make it possible to still have the steps synchronously for each item I want to fetch, but make it parallel for each individual item. This from an online service called
servicem8
So what I hope to achieve is:
fetch all possible job ids => keep name/and other info
fetch name of the customer
fetch each attachment of a job
These three steps should be done for each job. So I could make things parallel for each job as they do not have to wait on each other.
Update:
Problem I do not understand is how can you make sure that you bundle for example the three calls per item in one call as its only per item that I can do things in parallel so for example when I want to
fetch item( fetch name => fetch description => fetch id)
so its the fetch item I want to make parallel?
The current code I have is working but rather slow:
import requests
import dateutil.parser
import shutil
import os
user = "test#test.com"
passw = "test"
print("Read json")
url = "https://api.servicem8.com/api_1.0/job.json"
r = requests.get(url, auth=(user, passw))
print("finished reading jobs.json file")
scheduled_jobs = []
if r.status_code == 200:
for item in r.json():
scheduled_date = item['job_is_scheduled_until_stamp']
try:
parsed_date = dateutil.parser.parse(scheduled_date)
if parsed_date.year == 2016:
if parsed_date.month == 10:
if parsed_date.day == 10:
url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item[
'company_uuid'])
c = requests.get(url_customer, auth=(user, passw))
cus_name = c.json()['name']
scheduled_jobs.append(
[item['uuid'], item['generated_job_id'], cus_name])
except ValueError:
pass
for job in scheduled_jobs:
print("fetch for job {}".format(job))
url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[
0])
r = requests.get(url, auth=(user, passw))
if r.json() == []:
pass
for attachment in r.json():
if attachment['active'] == 1 and attachment['file_type'] != '.pdf':
print("fetch for attachment {}".format(attachment))
url_staff = "https://api.servicem8.com/api_1.0/Staff.json?%24filter=uuid%20eq%20{}".format(
attachment['created_by_staff_uuid'])
s = requests.get(url_staff, auth=(user, passw))
for staff in s.json():
tech = "{}_{}".format(staff['first'], staff['last'])
url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment[
'uuid'])
r = requests.get(url, auth=(user, passw), stream=True)
if r.status_code == 200:
creation_date = dateutil.parser.parse(
attachment['timestamp']).strftime("%d.%m.%y")
if not os.path.exists(os.getcwd() + "/{}/{}".format(job[2], job[1])):
os.makedirs(os.getcwd() + "/{}/{}".format(job[2], job[1]))
path = os.getcwd() + "/{}/{}/SC -O {} {}{}".format(
job[2], job[1], creation_date, tech.upper(), attachment['file_type'])
print("writing file to path {}".format(path))
with open(path, 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
else:
print(r.text)
Update [14/10]
I updated the code in the following way with some hints given. Thanks a lot for that. Only thing I could optimize I guess is the attachment downloading but it is working fine now. Funny thing I learned is that you cannot create a CON folder on a windows machine :-) did not know that.
I use pandas as well just to try to avoid some loops in my list of dicts but not sure if I am already most performant. Longest is actually reading in the full json files. I fully read them in as I could not find an API way of just telling the api, return me only the jobs from september 2016. The api query function seems to work on eq/lt/ht.
import requests
import dateutil.parser
import shutil
import os
import pandas as pd
user = ""
passw = ""
FOLDER = os.getcwd()
headers = {"Accept-Encoding": "gzip, deflate"}
import grequests
urls = [
'https://api.servicem8.com/api_1.0/job.json',
'https://api.servicem8.com/api_1.0/Attachment.json',
'https://api.servicem8.com/api_1.0/Staff.json',
'https://api.servicem8.com/api_1.0/Company.json'
]
#Create a set of unsent Requests:
print("Read json files")
rs = (grequests.get(u, auth=(user, passw), headers=headers) for u in urls)
#Send them all at the same time:
jobs,attachments,staffs,companies = grequests.map(rs)
#create dataframes
df_jobs = pd.DataFrame(jobs.json())
df_attachments = pd.DataFrame(attachments.json())
df_staffs = pd.DataFrame(staffs.json())
df_companies = pd.DataFrame(companies.json())
#url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
#c = requests.get(url_customer, auth=(user, passw))
#url = "https://api.servicem8.com/api_1.0/job.json"
#jobs = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading attachments json")
#url = "https://api.servicem8.com/api_1.0/Attachment.json"
#attachments = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading staff.json")
#url_staff = "https://api.servicem8.com/api_1.0/Staff.json"
#staffs = requests.get(url_staff, auth=(user, passw))
scheduled_jobs = []
if jobs.status_code == 200:
print("finished reading json file")
for job in jobs.json():
scheduled_date = job['job_is_scheduled_until_stamp']
try:
parsed_date = dateutil.parser.parse(scheduled_date)
if parsed_date.year == 2016:
if parsed_date.month == 9:
cus_name = df_companies[df_companies.uuid == job['company_uuid']].iloc[0]['name'].upper()
cus_name = cus_name.replace('/', '')
scheduled_jobs.append([job['uuid'], job['generated_job_id'], cus_name])
except ValueError:
pass
print("{} jobs to fetch".format(len(scheduled_jobs)))
for job in scheduled_jobs:
print("fetch for job attachments {}".format(job))
#url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
if attachments == []:
pass
for attachment in attachments.json():
if attachment['related_object_uuid'] == job[0]:
if attachment['active'] == 1 and attachment['file_type'] != '.pdf' and attachment['attachment_source'] != 'INVOICE_SIGNOFF':
for staff in staffs.json():
if staff['uuid'] == attachment['created_by_staff_uuid']:
tech = "{}_{}".format(
staff['first'].split()[-1].strip(), staff['last'])
creation_timestamp = dateutil.parser.parse(
attachment['timestamp'])
creation_date = creation_timestamp.strftime("%d.%m.%y")
creation_time = creation_timestamp.strftime("%H_%M_%S")
path = FOLDER + "/{}/{}/SC_-O_D{}_T{}_{}{}".format(
job[2], job[1], creation_date, creation_time, tech.upper(), attachment['file_type'])
# fetch attachment
if not os.path.isfile(path):
url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment[
'uuid'])
r = requests.get(url, auth=(user, passw), stream = True)
if r.status_code == 200:
if not os.path.exists(FOLDER + "/{}/{}".format(job[2], job[1])):
os.makedirs(
FOLDER + "/{}/{}".format(job[2], job[1]))
print("writing file to path {}".format(path))
with open(path, 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
else:
print("file already exists")
else:
print(r.text)

General idea is to use asynchronous url requests and there is a python module named grequests for that-https://github.com/kennethreitz/grequests
From Documentation:
import grequests
urls = [
'http://www.heroku.com',
'http://python-tablib.org',
'http://httpbin.org',
'http://python-requests.org',
'http://fakedomain/',
'http://kennethreitz.com'
]
#Create a set of unsent Requests:
rs = (grequests.get(u) for u in urls)
#Send them all at the same time:
grequests.map(rs)
And the resopnse
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, None, <Response [200]>]

How to properly debug ThreadPool?

I'm trying to get some data from a web page. To speed up this process (they allow me to make 1000 requests per minute), I use ThreadPool.
Since there is a huge amount of data, the process is quite vulnerable to connection fails etc. so I try to log everything I can to be able to detect each mistake I did in code.
The problem is that program sometimes just stops without any exception (it acts like it is running but with no effect - I use PyCharm). I log catched exceptions everywhere I can but I can't see any exception in any log.
I assume that if there were a timeout reached, the exception would be raised and logged.
I've found out where the problem could be. Here is the code:
As a pool, I use: from multiprocessing.pool import ThreadPool as Pool
And lock: from threading import Lock
The download_category function is being used in loop.
def download_category(url):
# some code
#
# ...
log('Create pool...')
_pool = Pool(_workers_number)
with open('database/temp_produkty.txt') as f:
log('Spracovavanie produktov... vytvaranie vlakien...') # I see this in log
for url_product in f:
x = _pool.apply_async(process_product, args=(url_product.strip('\n'), url))
_pool.close()
_pool.join()
log('Presuvanie produktov z temp export do export.csv...') # I can't see this in log
temp_export_to_export_csv()
set_spracovanie_kategorie(url)
except Exception as e:
logging.exception('Got exception on download_one_category: {}'.format(url))
And process_product function:
def process_product(url, cat):
try:
data = get_product_data(url)
except:
log('{}: {} exception while getting product data... #') # I don't see this in log
return
try:
print_to_temp_export(data, cat) # I don't see this in log
except:
log('{}: {} exception while printing to csv... #') # I don't see this in log
raise
LOG function:
def log(text):
now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
_lock.acquire()
mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))
_lock.release()
I use logging module too. In this log, I see that probably 8 (number of workers) times request was sent but no answer hasn't been recieved.
EDIT1:
def get_product_data(url):
data = defaultdict(lambda: '-')
root = load_root(url)
try:
nazov = root.xpath('//h1[#itemprop="name"]/text()')[0]
except:
nazov = root.xpath('//h1/text()')[0]
under_block = root.xpath('//h2[#id="lowest-cost"]')
if len(under_block) < 1:
under_block = root.xpath('//h2[contains(text(),"Naj")]')
if len(under_block) < 1:
return False
data['nazov'] = nazov
data['url'] = url
blocks = under_block[0].xpath('./following-sibling::div[#class="shp"]/div[contains(#class,"shp")]')
i = 0
for block in blocks:
i += 1
data['dat{}_men'.format(i)] = eblock.xpath('.//a[#class="link"]/text()')[0]
del root
return data
LOAD ROOT:
class RedirectException(Exception):
pass
def load_url(url):
r = requests.get(url, allow_redirects=False)
if r.status_code == 301:
raise RedirectException
if r.status_code == 404:
if '-q-' in url:
url = url.replace('-q-','-')
mLib.printToFileWOEncoding('logging/neexistujuce.txt','Skusanie {} kategorie...'.format(url))
return load_url(url) # THIS IS NOT LOOPING
else:
mLib.printToFileWOEncoding('logging/neexistujuce.txt','{}'.format(url))
html = r.text
return html
def load_root(url):
try:
html = load_url(url)
except Exception as e:
logging.exception('load_root_exception')
raise
return etree.fromstring(html, etree.HTMLParser())

using python stream downloading file with server limit

I tried to download file from a server using python, sometimes the file is very large, I would like to have some progress bar, one way to do this I can come up with is to download in a stream, so that I can print the progress. Currently I have tried the standard urlopen, urlretrieve, and requests module (with stream on).
Obviously, urlopen cannot download file in stream, requests module support this, however, the server has limit on the file I can download at one time (its limit is 1). So everytime, I tried to use requests, it only get the webpage told me to wait, is there any other way to do this?

I have very recently downloaded many types of media with this function:
import sys
import requests
import time
def download_resource(domain, url, file_name = None, download = True):
cookies = {}
s = requests.Session()
s.config['keep_alive'] = True
#add your own cookies here, I have a specific function I call
#for my application but yours is different
r = s.get(url, cookies = cookies, stream = True)
if not r.ok:
print "error in downloading"
return -1
file_size = int(r.headers['content-length'])
if not file_name:
try:
temp = r.headers['content-disposition']
except Exception as e:
pass
#failing download
return -1
else:
if not temp:
return -1
else:
file_name = temp.split("filename=")[-1]
return_obj["filename"] = file_name
#print "File size:", file_size
#print "\n", str(self.entire_size / float(1024*1024*1024)), "\n"
print "Downloading:", file_name
if download:
with open(file_name, "wb") as fh:
count = 1
chunk_size = 1048576
start_time = time.time()
try:
for block in r.iter_content(chunk_size):
total_time = time.time() - start_time
percent = count*chunk_size/float(file_size) * 100.0
fraction = int(percent/5)
download_speed = 1.0 / total_time
sys.stdout.write('\r')
sys.stdout.write("[%-20s] %d%% %3.2f MB/s " % ('='* fraction , percent, download_speed))
sys.stdout.flush()
if not block:
break
fh.write(block)
count += 1
start_time = time.time()
except Exception as e:
print e
finally:
#close up the stream
r.close()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

writing variables in pandas dataframe with header - python

Related

Python program looks hung. CTRL-C resumes it

how to check if cpanel can login successfully with python2.7?

python request urls parallel [duplicate]

How to properly debug ThreadPool?

using python stream downloading file with server limit

Categories

Resources