Fastest way to iterate function over pandas dataframe - python

I have a function that operates on the rows of a CSV file, adding the values of various cells to dictionaries depending on whether certain conditions are met:
df = pd.concat([pd.read_csv(filename) for filename in args.csv], ignore_index = True)
ID_Use_Totals = {}
ID_Order_Dates = {}
ID_Received_Dates = {}
ID_Refs = {}
IDs = args.ID
def TSQs(row):
    global ID_Use_Totals, ID_Order_Dates, ID_Received_Dates
    if row['Stock Item'] not in IDs:
        pass
    else:
        if row['Action'] in ['Order/Resupply', 'Cons. Purchase']:
            if row['Stock Item'] not in ID_Order_Dates:
                ID_Order_Dates[row['Stock Item']] = [{row['Ref']: pd.to_datetime(row['TransDate'])}]
            else:
                ID_Order_Dates[row['Stock Item']].append({row['Ref']: pd.to_datetime(row['TransDate'])})
        elif row['Action'] == 'Received':
            if row['Stock Item'] not in ID_Received_Dates:
                ID_Received_Dates[row['Stock Item']] = [{row['Ref']: pd.to_datetime(row['TransDate'])}]
            else:
                ID_Received_Dates[row['Stock Item']].append({row['Ref']: pd.to_datetime(row['TransDate'])})
        elif row['Action'] == 'Use':
            if row['Stock Item'] in ID_Use_Totals:
                ID_Use_Totals[row['Stock Item']].append(row['Qty'])
            else:
                ID_Use_Totals[row['Stock Item']] = [row['Qty']]
        else:
            pass
Currently, I am doing:
for index, row in df.iterrows():
    TSQs(row)
But timer() reports between 70 and 90 seconds for a 40,000-line CSV file.
I want to know the fastest way to implement this over the entire dataframe (which could potentially be hundreds of thousands of rows).

I'd wager not using Pandas for this could be faster.
Additionally, you can use defaultdicts to avoid having to check whether you've seen a given product yet:
import csv
import collections
import datetime
ID_Use_Totals = collections.defaultdict(list)
ID_Order_Dates = collections.defaultdict(list)
ID_Received_Dates = collections.defaultdict(list)
ID_Refs = {}
IDs = set(args.ID)
order_actions = {"Order/Resupply", "Cons. Purchase"}
for filename in args.csv:
    with open(filename) as f:
        for row in csv.DictReader(f):
            item = row["Stock Item"]
            if item not in IDs:
                continue
            ref = row["Ref"]
            action = row["Action"]
            if action in order_actions:
                date = datetime.datetime.fromisoformat(row["TransDate"])
                ID_Order_Dates[item].append({ref: date})
            elif action == "Received":
                date = datetime.datetime.fromisoformat(row["TransDate"])
                ID_Received_Dates[item].append({ref: date})
            elif action == "Use":
                ID_Use_Totals[item].append(row["Qty"])
EDIT: If the CSV is really of the form
"Employee", "Stock Location", "Stock Item"
"Ordered", "16", "32142"
the stock CSV module can't quite parse it.
You could use Pandas to parse the file, then iterate over rows, though I'm not sure if this'll end up being much faster in the end:
import collections
import datetime
import pandas as pd
ID_Use_Totals = collections.defaultdict(list)
ID_Order_Dates = collections.defaultdict(list)
ID_Received_Dates = collections.defaultdict(list)
ID_Refs = {}
IDs = set(args.ID)
order_actions = {"Order/Resupply", "Cons. Purchase"}
for filename in args.csv:
    for index, row in pd.read_csv(filename).iterrows():
        item = row["Stock Item"]
        if item not in IDs:
            continue
        ref = row["Ref"]
        action = row["Action"]
        if action in order_actions:
            date = datetime.datetime.fromisoformat(row["TransDate"])
            ID_Order_Dates[item].append({ref: date})
        elif action == "Received":
            date = datetime.datetime.fromisoformat(row["TransDate"])
            ID_Received_Dates[item].append({ref: date})
        elif action == "Use":
            ID_Use_Totals[item].append(row["Qty"])

You can use the apply function. The code will look like this:
df.apply(TSQs, axis=1)
Here, with axis=1, each row is sent to the function TSQs as a pd.Series, so inside the function you can index it like row["Ref"] to get that row's value. This generally runs faster than an explicit for loop over iterrows().
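If you want to check how much that buys you on your own data, here is a quick sketch (it assumes df and TSQs from the question are already in scope; absolute timings will vary):
import timeit

# Note: both runs mutate the global dicts, so use throwaway copies if that matters.
t_loop = timeit.timeit(lambda: [TSQs(row) for _, row in df.iterrows()], number=1)
t_apply = timeit.timeit(lambda: df.apply(TSQs, axis=1), number=1)
print(f"iterrows loop: {t_loop:.2f}s   apply: {t_apply:.2f}s")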

Probably fastest not to iterate at all:
# Build some boolean indices for your various conditions
idx_stock_item = df["Stock Item"].isin(IDs)
idx_purchases = df["Action"].isin(['Order/Resupply', 'Cons. Purchase'])
idx_order_dates = df["Stock Item"].isin(ID_Order_Dates)
# combine the indices to act on specific rows all at once
idx_combined = idx_stock_item & idx_purchases & ~idx_order_dates
# It looks like you were putting a single-entry dictionary in each row -
# wouldn't it make more sense to just use two columns, i.e. take advantage of
# the DataFrame data structure? (ID_Order_Dates below is a DataFrame, not a dict.)
ID_Order_Dates.loc[df.loc[idx_combined, "Stock Item"], "Ref"] = df.loc[idx_combined, "Ref"]
ID_Order_Dates.loc[df.loc[idx_combined, "Stock Item"], "Date"] = df.loc[idx_combined, "TransDate"]
# repeat for your other cases
# ...
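As a follow-up, here is a hedged sketch of how the "Use" case from the question could be collected the same way, with no per-row Python work at all (it assumes df, IDs, idx_stock_item and the question's column names):
# Gather all "Use" quantities per stock item in one shot via groupby.
use_rows = df[idx_stock_item & (df["Action"] == "Use")]
ID_Use_Totals = use_rows.groupby("Stock Item")["Qty"].apply(list).to_dict()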

Related

Skip First Column in CSV File with Pandas

I have a generated CSV file that has some extra information in its first line, and I'm trying to skip that line, but nothing seems to work. I tried using skiprows and looked at several suggestions and examples, including:
Pandas drop first columns after csv read
https://datascientyst.com/pandas-read-csv-file-read_csv-skiprows/
Nothing I tried worked the way I wanted; when I did get it to skip something, it deleted an entire row instead.
Here is a sample of the code
# Imports the Pandas Module. It must be installed to run this script.
import pandas as pd
# Gets source file link
source_file = 'Csvfile.csv'
# Gets csv file and encodes it into a format that is compatible.
dataframe = pd.read_csv(source_file, encoding='latin1')
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages, 'Copies': dataframe.Copies,
                   'Color': dataframe.Grayscale, 'Duplex': dataframe.Duplex, 'Printer': dataframe.Printer})
# Formats data so that it can be used to count Duplex and Color pages.
df.loc[df["Duplex"] == "DUPLEX", "Duplex"] = dataframe.Pages
df.loc[df["Duplex"] == "NOT DUPLEX", "Duplex"] = 0
df.loc[df["Color"] == "NOT GRAYSCALE", "Color"] = dataframe.Pages
df.loc[df["Color"] == "GRAYSCALE", "Color"] = 0
df.sort_values(by=['User', 'Pages'])
file = df.to_csv('PrinterLogData.csv', index=False)
# Opens parsed CSV file.
output_source = "PrinterLogData.csv"
dataframe = pd.read_csv(output_source, encoding='latin1')
# Creates new DataFrame.
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages, 'Copies': dataframe.Copies,
                   'Color': dataframe.Color, 'Duplex': dataframe.Duplex, 'Printer': dataframe.Printer})
# Groups data by Users and Printer Sums
Report1 = df.groupby(['User'], as_index=False).sum().sort_values('Pages', ascending=False)
Report2 = (df.groupby(['Printer'], as_index=False).sum()).sort_values('Pages', ascending=False)
(Sample data and a sample of the desired output were provided as attachments.)
This is an early draft of what you appear to want for your program (based on the simulated print-log.csv):
import csv
import itertools
import operator
import pathlib

CSV_FILE = pathlib.Path('print-log.csv')
EXTRA_COLUMNS = ['Pages', 'Grayscale', 'Color', 'Not Duplex', 'Duplex']

def main():
    with CSV_FILE.open('rt', newline='') as file:
        iterator = iter(file)
        next(iterator)  # skip first line if needed
        reader = csv.DictReader(iterator)
        table = list(reader)
    create_report(table, 'Printer')
    create_report(table, 'User')

def create_report(table, column_name):
    key = operator.itemgetter(column_name)
    table.sort(key=key)
    field_names = [column_name] + EXTRA_COLUMNS
    with pathlib.Path(f'{column_name} Report').with_suffix('.csv').open(
        'wt', newline=''
    ) as file:
        writer = csv.DictWriter(file, field_names)
        writer.writeheader()
        report = []
        for key, group in itertools.groupby(table, key):
            report.append({column_name: key} | analyze_group(group))
        report.sort(key=operator.itemgetter('Pages'), reverse=True)
        writer.writerows(report)

def analyze_group(group):
    summary = dict.fromkeys(EXTRA_COLUMNS, 0)
    for row in group:
        pages = int(row['Pages']) * int(row['Copies'])
        summary['Pages'] += pages
        summary['Grayscale'] += pages if row['Grayscale'] == 'GRAYSCALE' else 0
        summary['Color'] += pages if row['Grayscale'] == 'NOT GRAYSCALE' else 0
        summary['Not Duplex'] += pages if row['Duplex'] == 'NOT DUPLEX' else 0
        summary['Duplex'] += pages if row['Duplex'] == 'DUPLEX' else 0
    return summary

if __name__ == '__main__':
    main()
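For completeness, here is a hedged pandas equivalent of the skip-the-first-line step, assuming the real header sits on the second line of Csvfile.csv:
import pandas as pd

# skiprows=1 drops the extra first line; the header is then read from line 2.
dataframe = pd.read_csv('Csvfile.csv', skiprows=1, encoding='latin1')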

How to make a dataframe from print result in for loop

I need to put my for-loop results into a dataframe. Here is my for loop:
import datetime
import os
import xml.etree.ElementTree as et

for filename in os.listdir("/data/rrd_dump_xml/"):
    if filename.endswith(".xml"):
        totaldir = "/data/rrd_dump_xml/" + filename
        tree = et.parse(totaldir)
        root = tree.getroot()
        NAME = []
        for name in root.iter('name'):
            NAME.append(name.text)
        UPDATE = []
        for update in root.iter('lastupdate'):
            UPDATE.append(update.text)
        updated = datetime.datetime.fromtimestamp(int(UPDATE[0]))
        lastupdate = updated.strftime('%Y-%m-%d %H:%M:%S')
        ParaValue = []
        for parameterevalue in root.iter('value'):
            ParaValue.append(parameterevalue.text)
        print(filename, lastupdate, NAME[0], ParaValue[0])
        print(filename, lastupdate, NAME[1], ParaValue[1])
    else:
        print("Error")
I need to get a dataframe with the column headers below:
filename lastupdate Name Value
Note: for each file in the loop there are two printed results (print(filename,lastupdate,NAME[0],ParaValue[0]) and print(filename,lastupdate,NAME[1],ParaValue[1])).
Can someone help me do this? I checked some examples, such as
Writing output of a for loop to pandas data-frame, but when I use those methods I don't get the correct output.
I tried this sample answer:
df = pd.DataFrame(list(zip(cutoff_list, number_list)),
                  columns=['cutoff', 'number'])
Instead of printing the output, add it to a list, and convert the list to a dataframe.
import datetime
import os
import xml.etree.ElementTree as et

import pandas as pd

content = []
for filename in os.listdir("/data/rrd_dump_xml/"):
    if filename.endswith(".xml"):
        totaldir = "/data/rrd_dump_xml/" + filename
        tree = et.parse(totaldir)
        root = tree.getroot()
        NAME = []
        for name in root.iter('name'):
            NAME.append(name.text)
        UPDATE = []
        for update in root.iter('lastupdate'):
            UPDATE.append(update.text)
        updated = datetime.datetime.fromtimestamp(int(UPDATE[0]))
        lastupdate = updated.strftime('%Y-%m-%d %H:%M:%S')
        ParaValue = []
        for parameterevalue in root.iter('value'):
            ParaValue.append(parameterevalue.text)
        # print(filename, lastupdate, NAME[0], ParaValue[0])
        content.append({"filename": filename,
                        "lastupdate": lastupdate,
                        "Name": NAME[0],
                        "Value": ParaValue[0]})
        # print(filename, lastupdate, NAME[1], ParaValue[1])
        content.append({"filename": filename,
                        "lastupdate": lastupdate,
                        "Name": NAME[1],
                        "Value": ParaValue[1]})
    else:
        print("Error")

dataframe = pd.DataFrame(content)

csv.DictReader delimiter inside a csv field with multiple quotes

I have the following example csv, that I am reading with:
f = StringIO(response.encode('utf-8'))
reader = csv.DictReader(f, quotechar='"', delimiter=';', quoting=csv.QUOTE_ALL, skipinitialspace=True)
example csv:
id;name;community;owner;owns;description;uuid
3c;NP;NoProb;NoP;Text;text_with_no_issues;
3c;NP;NoProb;NoP;TextText;text_with_no_issues2;
1A;fooo;barr;Bar;TEXT1;"\"text\"\"None\"\";text\"\"TEXT\"\"text\"";
1A;fooo;barr;Bar;TEXT2;"\"text\"\"None\"\";text\"\"TEXT\"\"text\"";
2B;BAR;foo;Bar;TEXT3;"\"text\"\"None\"\";text\"\"TEXT\"\"text\";text\"\"TEXT\"\"text\"";
2B;BAR;foo;Bar;TEXT4;"\"text\"\"None\"\";text\"\"TEXT\"\"text\";text\"\"TEXT\"\"text\"";
The uuid column is empty in all cases.
Within the "reader" there are multiple entries with the same 'name' and 'id' which I am "merging", but on lines like the last four (1A, 2B) I hit an issue because of the ";" delimiter inside the description.
Even with quotechar='"' and quoting=csv.QUOTE_ALL, the description column gets split at the delimiter and spills into the next column (uuid) and into a "None" column, which corrupts my data.
Any idea how to solve this one?
P.S. for the merge logic I am using two variants:
##############################################################
name_index = []
result = []
for line in reader:
    idx = line["name"]
    if idx not in name_index:
        name_index.append(idx)
        result.append(line)
    else:
        idx_curr_dict = result[name_index.index(idx)]
        merge_entries = [idx_curr_dict, line]
        placeholder = {}
        for key in idx_curr_dict:
            placeholder[key] = ", ".join(list(set(d[key] for d in merge_entries if d[key] != "" and d[key])))
        result[name_index.index(idx)] = placeholder
##############################################################
and a bit slower one, but not that complicated:
##############################################################
data = [line for line in reader]  # Deplete the iterator
unique_names = set([line['name'] for line in data])  # List of unique names
column_names = [key for key in data[0] if key != 'name' and key != 'uuid']  # All other useful columns
result = []
for name in unique_names:
    same_named_lines = [line for line in data if line['name'] == name]
    unique_line = {'name': name}
    for column in column_names:
        value = ", ".join(set([line[column] for line in same_named_lines]))
        unique_line[column] = value
    result.append(unique_line)
##############################################################
Thanks a lot in advance!
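One avenue worth trying, assuming those backslashes really are literal characters in the file rather than an artifact of how the sample was pasted (a sketch, not verified against the real export): tell the csv module that quotes inside fields are escaped with a backslash instead of being doubled.
import csv
from io import StringIO

# One of the problem rows from the question, as it would appear on disk.
sample = (
    'id;name;community;owner;owns;description;uuid\n'
    '1A;fooo;barr;Bar;TEXT1;"\\"text\\"\\"None\\"\\";text\\"\\"TEXT\\"\\"text\\"";\n'
)

reader = csv.DictReader(
    StringIO(sample),
    delimiter=';',
    quotechar='"',
    doublequote=False,  # quotes inside a field are not doubled...
    escapechar='\\',    # ...they are escaped with a backslash
    skipinitialspace=True,
)
for row in reader:
    print(row['description'])  # the embedded ';' stays inside the description
    print(repr(row['uuid']))   # uuid stays empty instead of swallowing the overflow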

Need assistance with dictionaries, csv files, and lists

Alright, so I need code that takes a CSV file and reads the values in it (so far I've got that part down).
What I'm having trouble with is creating a list of those values, ordered from least frequently occurring to most frequently occurring, with no duplicate values.
Here's what I have:
import csv

B = []
K = []

def readandprocess(name):
    with open(name, newline='') as csvf:
        freader = csv.reader(csvf, delimiter=',', quotechar='"')
        datasg = {}
        artists = []
        for row in freader:
            artist = row[2]
            B.append(artist)
        for artist in B:
            c = B.count(artist)
            K.append(artist + str(c))
        list(set(K))
        print(K)

        # for row in freader:
        #     artist = row[2]
        #     ## song = row[1]
        #     if artist == 'Rolling Stones':
        #         print('Rolling Stones title: ', row[1])
        #     if artist not in datasg:
        #         datasg[artist] = [song]
        #     else:
        #         datasg[artist].append(song)
        # for artist in datasg:
        #     print(artist, datasg[artist])

        print('--------------------------------------')
        info = datasg.items()
        # tosort = [(len(t[1]),t[0]) for t in info]
        # info = sorted(tosort)
        # print(info[-30:])
        # print(info)
        print(len(datasg))  # currently 0, populate at will  # Number of keys in dictionary
    return datasg

if __name__ == '__main__':
    datasg = readandprocess('data/top1000.csv')
Try using Counter. Once you have all the items you need in a list, you can use a Counter, and then call most_common(n) to get the n most common elements.
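For example, here is a minimal sketch of that suggestion (it assumes, as in the question's code, that the artist name is in column 2 of data/top1000.csv):
import collections
import csv

def artists_least_to_most_common(name):
    with open(name, newline='') as csvf:
        freader = csv.reader(csvf, delimiter=',', quotechar='"')
        counts = collections.Counter(row[2] for row in freader)
    # most_common() goes from most to least frequent; reversing it gives the
    # requested order, and each artist appears exactly once (no duplicates).
    return [artist for artist, _ in reversed(counts.most_common())]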

Most effective way to parse CSV and take action based on content of row

I have a CSV file that Splunk generates, similar in format to the following:
Category,URL,Hash,ID,"__mv_Hash","_mkv_ID"
binary,somebadsite.com/file.exe,12345abcdef,123,,,
callback,bad.com,,567,,,
What I need to do is iterate through the CSV file, maintaining header order, and take a different action if the result is a binary or callback. For this example, if the result is a binary I'll return an arbitrary "clean" or "dirty" rating and if it's a callback I'll just print out the details.
Below is the code I'm currently planning to use, but I'm new to Python and would like feedback on it and whether there is a better way to accomplish this. I'm also not fully clear on the difference between how I handle the binary case: for k in (k for k in r.fieldnames if (not k.startswith("""__mv_""") and not k.startswith("""_mkv_"""))) and how I handle the non-binary case. Both achieve the same result, so what's the benefit of one over the other?
import gzip
import csv
import json

csv_file = 'test_csv.csv.gz'

class GZipCSVReader:
    def __init__(self, filename):
        self.gzfile = gzip.open(filename)
        self.reader = csv.DictReader(self.gzfile)
        self.fieldnames = self.reader.fieldnames

    def next(self):
        return self.reader.next()

    def close(self):
        self.gzfile.close()

    def __iter__(self):
        return self.reader.__iter__()

def get_rating(hash):
    if hash == "12345abcdef":
        rating = "Dirty"
    else:
        rating = "Clean"
    return hash, rating

def print_callback(result):
    print json.dumps(result, sort_keys=True, indent=4, separators=(',', ':'))

def process_results_content(r):
    for row in r:
        values = {}
        values_misc = {}
        if row["Category"] == "binary":
            # Iterate through key:value pairs and add to dictionary
            for k in (k for k in r.fieldnames if (not k.startswith("""__mv_""") and not k.startswith("""_mkv_"""))):
                v = row[k]
                values[k] = v
            rating = get_rating(row["Hash"])
            if rating[1] == "Dirty":
                print rating
        else:
            for k in r.fieldnames:
                if not k.startswith("""__mv_""") and not k.startswith("""_mkv_"""):
                    v = row[k]
                    values_misc[k] = v
            print_callback(values_misc)
    r.close()

if __name__ == '__main__':
    r = GZipCSVReader(csv_file)
    process_results_content(r)
Finally, would a for...else loop be better than doing something such as if row["Category"] == "binary"? For example, could I do something like this:
def process_results_content(r):
    for row in r:
        values = {}
        values_misc = {}
        for k in (k for k in r.fieldnames if (not row["Category"] == "binary")):
            v = row[k]
            ...
        else:
            v = row[k]
            ...
It seems like that would be the same logic, where the first clause captures anything not binary and the second captures everything else, but it does not produce the correct result.
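(A side note on that idea: the else of a for loop is not a per-item branch the way if/else is; it runs exactly once, after the loop finishes without hitting a break, as this minimal sketch shows.)
for k in ["Category", "URL"]:
    print("loop body:", k)
else:
    print("runs once, only because the loop completed without a break")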
My take using the pandas library.
Code:
import pandas as pd
csv_file = 'test_csv.csv'
df = pd.read_csv(csv_file)
df = df[["Category","URL","Hash","ID"]] # Remove the other columns.
get_rating = lambda x: "Dirty" if x == "12345abcdef" else "Clean"
df["Rating"] = df["Hash"].apply(get_rating) # Assign a value to each row based on Hash value.
print df
j = df.to_json() # Self-explanatory. :)
print j
Result:
Category URL Hash ID Rating
0 binary somebadsite.com/file.exe 12345abcdef 123 Dirty
1 callback bad.com NaN 567 Clean
{"Category":{"0":"binary","1":"callback"},"URL":{"0":"somebadsite.com\/file.exe","1":"bad.com"},"Hash":{"0":"12345abcdef","1":null},"ID":{"0":123,"1":567},"Rating":{"0":"Dirty","1":"Clean"}}
If this is your intended result, then just substitute the above into your GZipCSVReader, since I did not emulate opening the gzip file.
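(If the gzip handling matters, pandas can read the compressed file directly; a small sketch, assuming the same filename as in the question:)
# compression='infer' is the default and keys off the .gz suffix, so this is
# equivalent to plain pd.read_csv('test_csv.csv.gz').
df = pd.read_csv('test_csv.csv.gz', compression='gzip')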
