How to make a dataframe from print result in for loop - python

I need to collect the results printed by my for loop into a dataframe. Here is my for loop:
import os
for filename in os.listdir("/data/rrd_dump_xml/"):
if filename.endswith(".xml") :
totaldir="/data/rrd_dump_xml/"+filename
tree=et.parse(totaldir)
root=tree.getroot()
NAME = []
for name in root.iter('name'):
NAME.append(name.text)
UPDATE = []
for update in root.iter('lastupdate'):
UPDATE.append(update.text)
updated = datetime.datetime.fromtimestamp(int(UPDATE[0]))
lastupdate=updated.strftime('%Y-%m-%d %H:%M:%S')
ParaValue = []
for parameterevalue in root.iter('value'):
ParaValue.append(parameterevalue.text)
print(filename,lastupdate,NAME[0],ParaValue[0])
print(filename,lastupdate,NAME[1],ParaValue[1])
else:
print("Error")
I need to get a dataframe with the following column headers:
filename lastupdate Name Value
Note: In each file in for loop, there will be two print results( print(filename,lastupdate,NAME[0],ParaValue[0]) and print(filename,lastupdate,NAME[1],ParaValue[1]) )
Can someone help me do this? I checked some examples, such as
"Writing output of a for loop to pandas data-frame", but when I use those methods I do not get the correct output.
Tried sample answer.
df = pd.DataFrame(list(zip(cutoff_list , number_list)),
columns =['cutoff', 'number'])

Instead of printing the output, add it to a list, and convert the list to a dataframe.
import datetime
import os
# NOTE(review): `et` is assumed to be the stdlib ElementTree module, since the
# code only uses et.parse / getroot / iter — confirm against the asker's setup.
import xml.etree.ElementTree as et

import pandas as pd

XML_DIR = "/data/rrd_dump_xml/"

# Instead of printing each result, store it as one dict per output row and
# build the DataFrame once, after the loop.
content = []
for filename in os.listdir(XML_DIR):
    if filename.endswith(".xml"):
        root = et.parse(os.path.join(XML_DIR, filename)).getroot()

        NAME = [name.text for name in root.iter('name')]
        UPDATE = [update.text for update in root.iter('lastupdate')]
        ParaValue = [value.text for value in root.iter('value')]

        # The first <lastupdate> is parsed as an integer timestamp and
        # formatted as a readable date/time string.
        updated = datetime.datetime.fromtimestamp(int(UPDATE[0]))
        lastupdate = updated.strftime('%Y-%m-%d %H:%M:%S')

        # Each file contributes two rows: the first two name/value pairs
        # (the two print() calls in the question).
        for i in (0, 1):
            content.append({"filename": filename,
                            "lastupdate": lastupdate,
                            "Name": NAME[i],
                            "Value": ParaValue[i]})
    else:
        print("Error")

# Dict keys become the column headers: filename, lastupdate, Name, Value.
dataframe = pd.DataFrame(content)

Related

How to convert your script into a class?

I wrote some code to convert text from a PDF file into a pandas dataframe. The code works well as a plain script, but when I try to fit it into a class and define a function for it, it returns an error.
import pdfplumber
import pandas as pd
import re
cols = ["Declaration Number", "Declaration Date", "Warehouse", "Quantity", "Number of boxes", "Product name", "Invoice Number"]
dataset = []
quant = []
date = []
decl_date = []
decl = re.compile(r'\d{8}AN\d{6}')
decd = re.compile(r'\d{2}\.\d{2}\.\d{4}')
whse = re.compile(r'ANTREPO | LÄ°MAN')
qty = re.compile(r'\d.KAP')
prod = re.compile(r'Ticari')
invNo = re.compile(r'Fatura')
class pdf():
def __init__(self):
self.kap = None
self.kg = None
def FirstPage():
with pdfplumber.open("44550500AN087999.pdf") as pdf:
page = pdf.pages[0]
text = page.extract_text()
for line in text.split('\n'):
if decl.search(line):
decl_num = line.split()[-1]
if decd.search(line):
decl_date = []
date = []
decl_date.append(line.split())
date = decl_date[1][-1]
if whse.search(line):
warehouse = line.split()
if qty.search(line):
quant = line.split()
kap = quant[0] + " " + quant[1]
kg = quant[2] + " " + quant[3]
when I run it it returns with several errors:
For instance:
<ipython-input-26-bc082b4afef0> in FirstPage()
20 date = []
21 decl_date.append(line.split())
---> 22 date = decl_date[1][-1]
23 if whse.search(line):
24 warehouse = line.split()
IndexError: list index out of range
I am probably defining the variables wrong, but I am a newbie, so does anyone have any idea what I am doing wrong?
You are only putting one element into decl_date, and then trying to access the second element inside that list, which does not exist.
Your use of line.split() seems incorrect to me. The way you have used them essentially only puts the string into a 1-element list "string" -> ["string"].
I assume you want to split the string by using the regex in each if-statement, in that case change line.split() to pattern.split(line)[index], swapping out pattern and index

Fastest way to iterate function over pandas dataframe

I have a function which operates over lines of a csv file, adding values of different cells to dictionaries depending on whether conditions are met:
df = pd.concat([pd.read_csv(filename) for filename in args.csv], ignore_index = True)
ID_Use_Totals = {}
ID_Order_Dates = {}
ID_Received_Dates = {}
ID_Refs = {}
IDs = args.ID
def TSQs(row):
global ID_Use_Totals, ID_Order_Dates, ID_Received_Dates
if row['Stock Item'] not in IDs:
pass
else:
if row['Action'] in ['Order/Resupply', 'Cons. Purchase']:
if row['Stock Item'] not in ID_Order_Dates:
ID_Order_Dates[row['Stock Item']] = [{row['Ref']: pd.to_datetime(row['TransDate'])}]
else:
ID_Order_Dates[row['Stock Item']].append({row['Ref']: pd.to_datetime(row['TransDate'])})
elif row['Action'] == 'Received':
if row['Stock Item'] not in ID_Received_Dates:
ID_Received_Dates[row['Stock Item']] = [{row['Ref']: pd.to_datetime(row['TransDate'])}]
else:
ID_Received_Dates[row['Stock Item']].append({row['Ref']: pd.to_datetime(row['TransDate'])})
elif row['Action'] == 'Use':
if row['Stock Item'] in ID_Use_Totals:
ID_Use_Totals[row['Stock Item']].append(row['Qty'])
else:
ID_Use_Totals[row['Stock Item']] = [row['Qty']]
else:
pass
Currently, I am doing:
for index, row in df.iterrows():
TSQs(row)
But timer() returns between 70 and 90 seconds for a 40,000 line csv file.
I want to know what the fastest way of implementing this is over the entire dataframe (which could potentially be hundreds of thousands of rows).
I'd wager not using Pandas for this could be faster.
Additionally, you can use defaultdicts to avoid having to check whether you've seen a given product yet:
import csv
import collections
import datetime
ID_Use_Totals = collections.defaultdict(list)
ID_Order_Dates = collections.defaultdict(list)
ID_Received_Dates = collections.defaultdict(list)
ID_Refs = {}
IDs = set(args.ID)
order_actions = {"Order/Resupply", "Cons. Purchase"}
for filename in args.csv:
with open(filename) as f:
for row in csv.DictReader(f):
item = row["Stock Item"]
if item not in IDs:
continue
ref = row["Ref"]
action = row["Action"]
if action in order_actions:
date = datetime.datetime.fromisoformat(row["TransDate"])
ID_Order_Dates[item].append({ref: date})
elif action == "Received":
date = datetime.datetime.fromisoformat(row["TransDate"])
ID_Received_Dates[item].append({ref: date})
elif action == "Use":
ID_Use_Totals[item].append(row["Qty"])
EDIT: If the CSV is really of the form
"Employee", "Stock Location", "Stock Item"
"Ordered", "16", "32142"
the stock CSV module can't quite parse it.
You could use Pandas to parse the file, then iterate over rows, though I'm not sure if this'll end up being much faster in the end:
import collections
import datetime

# Imported under the alias `pd` because the loop below calls pd.read_csv().
import pandas as pd

# defaultdict(list) removes the "have I seen this item yet?" checks.
ID_Use_Totals = collections.defaultdict(list)
ID_Order_Dates = collections.defaultdict(list)
ID_Received_Dates = collections.defaultdict(list)
ID_Refs = {}
# Sets give constant-time membership tests inside the row loop.
IDs = set(args.ID)
order_actions = {"Order/Resupply", "Cons. Purchase"}

for filename in args.csv:
    # Pandas is used only to parse the file; rows are still visited one by one.
    for _, row in pd.read_csv(filename).iterrows():
        item = row["Stock Item"]
        if item not in IDs:
            continue
        ref = row["Ref"]
        action = row["Action"]
        if action in order_actions:
            date = datetime.datetime.fromisoformat(row["TransDate"])
            ID_Order_Dates[item].append({ref: date})
        elif action == "Received":
            date = datetime.datetime.fromisoformat(row["TransDate"])
            ID_Received_Dates[item].append({ref: date})
        elif action == "Use":
            ID_Use_Totals[item].append(row["Qty"])
You can use the apply function. The code will look like this:
df.apply(TSQs, axis=1)
Here, when axis=1, each row is sent to the function TSQs as a pd.Series, which you can index like row["Ref"] to get the value for that line. Since this is a vector operation, it will run much faster than a for loop.
Probably fastest not to iterate at all:
# Build some boolean indices for your various conditions
idx_stock_item = df["Stock Item"].isin(IDs)
idx_purchases = df["Action"].isin(['Order/Resupply', 'Cons. Purchase'])
idx_order_dates = df["Stock Item"].isin(ID_Order_Dates)
# combine the indices to act on specific rows all at once
idx_combined = idx_stock_item & idx_purchases & ~idx_order_dates
# It looks like you were putting a single entry dictionary in each row - wouldn't it make sense to rather just use two columns? i.e. take advantage of the DataFrame data structure
ID_Order_Dates.loc[df.loc[idx_combined, "Stock Item"], "Ref"] = df.loc[idx_combined, "Ref"]
ID_Order_Dates.loc[df.loc[idx_combined, "Stock Item"], "Date"] = df.loc[idx_combined, "TransDate"]
# repeat for your other cases
# ...

Multivalue SIngle Key Dictionary in Python3

I have the following dictionary in Python. I want to access the 2nd value and assign it to a new variable each time. I'm trying to do it using a list; however, I am unable to arrive at the answer I need.
exampledict = {
"a": ["url1", "file_name1"],
"b": ["url2", "filename2"],
"d": ["url4", "filename4"],
"c": ["url3", "filename3"],}
for key, value in exampledict.items():
url = value[0]
filename = value[1]
# do stuff with url and filename
# later:
# I want to do something will all my filenames without getting them again from the dict.
So the actual result would be storing the filename in 4 different variables, one per iteration, so that I can access them later. I'm trying to do this using an empty list and then extending it. But doing that gives me four separate filenames, each in its own list, rather than one list containing all 4 filenames (so I can access them by index). I'm new to programming and would appreciate step-by-step help and an explanation of the logic mistake I have made.
Kindly ignore any syntax errors in the code
No need to extend, simply fill a list iteratively:
exampledict = {
"a": ["url1", "file_name1"],
"b": ["url2", "filename2"],
"d": ["url4", "filename4"],
"c": ["url3", "filename3"],}
filenames = []
for key, value in exampledict.items():
url = value[0]
filename = value[1]
# do more stuff
print ("In loop", url, filename)
filenames.append(filename)
print(filenames)
Output:
In loop url1 file_name1
In loop url2 filename2
In loop url4 filename4
In loop url3 filename3
['file_name1', 'filename2', 'filename4', 'filename3']
If you are just interested in the filenames, you can extract them directly:
fns = [filename for _,filename in exampledict.values()]
You missed a comma in your dictionary in the "d" row.
Try this:
urls = []
files = []
exampledict = {
"a": ["url1", "file_name1"],
"b": ["url2", "filename2"],
"d": ["url4", "filename4"],
"c": ["url3", "filename3"],}
for key, value in exampledict.items():
url = value[0]
urls.append(url)
filename = value[1]
files.append(filename)
Now you can access the info in the lists urls and files like:
print(urls[0])... print(urls[3])
print(files[0])...print(files[3])

Convert Json to CSV using Python

Below, is the json structure I am pulling from my online weather station. I am also including a json_to_csv python script that is supposed to convert json data to csv output, but only returns a "Key" error. I want to pull data from "current_observation": only.
{
"response": {
"features": {
"conditions": 1
}
}
, "current_observation": {
"display_location": {
"latitude":"40.466442",
"longitude":"-85.362709",
"elevation":"280.4"
},
"observation_time_rfc822":"Fri, 26 Jan 2018 09:40:16 -0500",
"local_time_rfc822":"Sun, 28 Jan 2018 11:22:47 -0500",
"local_epoch":"1517156567",
"local_tz_short":"EST",
"weather":"Clear",
"temperature_string":"44.6 F (7.0 C)",
}
}
import csv, json, sys
inputFile = open("pywu.cache.json", 'r') #open json file
outputFile = open("CurrentObs.csv", 'w') #load csv file
data = json.load(inputFile) #load json content
inputFile.close() #close the input file
output = csv.writer(outputFile) #create a csv.write
output.writerow(data[0].keys())
for row in data:
output = csv.writer(outputFile) #create a csv.write
output.writerow(data[0].keys())
for row in data:
output.writerow(row.values()) #values row
What's the best method to retrieve the temperature string and convert to .csv format? Thank you!
import pandas as pd
df = pd.read_json("pywu.cache.json")
df = df.loc[["local_time_rfc822", "weather", "temperature_string"],"current_observation"].T
df.to_csv("pywu.cache.csv")
maybe pandas can be of help for you. the .read_json() function creates a nice dataframe, from which you can easily choose the desired rows and columns. and it can save as csv as well.
to add latitude and longitude to the csv-line, you can do this:
# Read the JSON cache; the .csv file is only the output written at the end.
df = pd.read_json("pywu.cache.json")
# Selecting one column by a list of row labels yields a Series of those fields.
df = df.loc[["local_time_rfc822", "weather", "temperature_string", "display_location"],"current_observation"].T
# display_location is itself a nested mapping; lift the two coordinates out of it.
location = df["display_location"]
# pd.concat replaces Series.append, which was removed in pandas 2.0.
df = pd.concat([df, pd.Series([location["latitude"], location["longitude"]], index=["latitude", "longitude"])])
df = df.drop("display_location")
df.to_csv("pywu.cache.csv")
to print the location in numeric values, you can do this:
df = pd.to_numeric(df, errors="ignore")
print(df['latitude'], df['longitude'])
This will find all keys (e.g. "temperature_string") specified inside of the json blob and then write them to a csv file. You can modify this code to get multiple keys.
import csv, json, sys
def find_deep_value(d, key):
    """Yield every value stored under *key* anywhere inside *d*.

    Modified from https://stackoverflow.com/questions/9807634/find-all-occurrences-of-a-key-in-nested-python-dictionaries-and-lists

    Args:
        d: Structure to search. Dictionaries are searched by key; lists and
            tuples (e.g. JSON arrays) are walked element by element. Any
            other value yields nothing.
        key: The dictionary key to look for.

    Yields:
        Each value found under ``key``. A match in a dictionary is yielded
        before the matches nested inside that dictionary's values.
    """
    if isinstance(d, dict):
        if key in d:
            yield d[key]
        children = d.values()
    elif isinstance(d, (list, tuple)):
        children = d
    else:
        return

    for child in children:
        # Only containers can hold further matches; strings and numbers are
        # skipped so that ``key in child`` is never a substring test.
        if isinstance(child, (dict, list, tuple)):
            yield from find_deep_value(child, key)
# Load the JSON first, so the output file is not created or truncated when
# the input is missing or malformed.
with open("pywu.cache.json", 'r') as inputFile:  # open json file
    data = json.load(inputFile)  # load json content

# Gives you a list of temperature_strings from within the json
temps = list(find_deep_value(data, "temperature_string"))

# The csv module asks for newline='' on files it writes; without it, extra
# blank lines can appear between rows on Windows.
with open("mypws.csv", 'w', newline='') as outputFile:  # open csv file
    output = csv.writer(outputFile)  # create a csv.writer
    output.writerow(temps)

Pandas error: Can only use .str accessor with string values, which use np.object_ dtype

My input file is a Json file
{ "infile":"c:/tmp/cust-in-sample.xlsx",
"SheetName":"Sheet1",
"CleanColumns":[1,2],
"DeleteColumns":[3,5],
"outfile":"c:/tmp/out-cust-in-sample.csv"
}
I would like the columns specified in the JSON to be cleaned and deleted. However, I'm getting the pandas string error.
I'm currently trying this code:
import json
import pandas as pd
import gzip
import shutil
import sys
zJsonFile = sys.argv[-1]
iCount = len(sys.argv)
if iCount == 2:
print "json file path " ,zJsonFile
else:
print "need a json file path ending the script"
sys.exit()
with open(zJsonFile,'rb') as zTestJson:
decoded = json.load(zTestJson)
#Parameterizing the code, reading each key from 'decoded' variable and putting it into another variable for the purpose
#of parameterizing
Infile = decoded.get('infile')
print Infile
Outfile = decoded.get('outfile')
print Outfile
Sheetname = decoded.get('SheetName')
print Sheetname
# this is a list
deletecols = decoded.get('DeleteColumns')
print deletecols
#this is a list
cleancols = decoded.get('CleanColumns')
print cleancols
input_sheet = pd.ExcelFile(Infile)
dfs = {}
for x in [Sheetname]:
dfs[x] = input_sheet.parse(x)
print dfs
df = pd.DataFrame(dfs[x]) # COnverting dict to dataframe
print df
deletecols = df.columns.values.tolist()
cleancols = df.columns.values.tolist()
for idx,item in enumerate(deletecols):
df.pop(item)
#df.drop(df.columns[deletecols],axis=1,inplace=True)
#Cleaning the code
#cleancols=[]
for x in cleancols:
df[x] = df[x].str.replace(to_replace = '"', value = '', regex = True)
df[x] = df[x].str.replace(to_replace = "'", value = '', regex = True)
df[x] = df[x].str.replace(to_replace = ",", value = '', regex = True)
I tried df.pop and df.drop, but neither seems to work for me, and looping through cleancols does not clean my file either.
Any help is highly appreciated.!

Categories

Resources