Python for loop enumerate - python

I am reading multiple csv files and combining them into one csv file. The desired outcome of the combined data looks like the following:
0 4 6 8 10 12
1 2 5 4 2 1
5 3 0 1 5 10
....
But in the following code, I intend the column to go from 0,4,6,8,10,12.
# Question code: read each csv in the folder and build one combined DataFrame,
# with the first file's second column renamed to 4 and each later file
# contributing one column named 2*indx+4.
for indx, file in enumerate(files_File1):
    if file.endswith('csv'):  # only process csv files in the designated folder
        filepath = os.path.join(folder_File1, file)
        current = pd.read_csv(filepath, header=None)
        if indx == 0:
            mydata_File1 = current.copy()
            # Source of the unexpected "2" column: this renames ONLY the label
            # of column 1 to 4; any further columns of the first file (e.g.
            # label 2) keep their original names.
            mydata_File1.columns.values[1] = 4
            print(mydata_File1.columns.values)
        else:
            # Append the second column of every later file under label 8, 10, 12, ...
            mydata_File1[2*indx+4] = current.iloc[:,1]
            print(mydata_File1.columns.values)
But instead, the outcome looks like this where the column goes from 0,2,4,6,8,10,12.
0 4 2 6 8 10 12
1 2 5 4 2 1
5 3 0 1 5 10
....
I am not quite sure what causes the column named "2".
Any idea?

If there is some reason you need pandas, then this will work. Your code references mydata_File1.columns.values, which holds the names of the columns, not the values in the columns. If this doesn't answer your question, then please provide a more complete question, per @juanpa.arrivillaga's comment.
#! python3
# Combine every .csv in the folder (except the combined output itself) into
# one csv file, stacking the rows of each input.
import os
import pandas as pd
import glob

folder_File1 = r"C:\Users\Public\Documents\Python\CombineCSVFiles"
csv_only = r"\*.csv"
files_File1 = glob.glob(f'{folder_File1}{csv_only}')
new_csv = f'{folder_File1}\\newcsv.csv'

frames = []  # one DataFrame per input file; concatenated once at the end
for file in files_File1:
    if file == new_csv:
        continue  # don't re-ingest a previous combined output
    current = pd.read_csv(file, header=None)
    print(current)
    frames.append(current)

# pd.concat replaces DataFrame.append (removed in pandas 2.0).  Collecting
# into `frames` first also fixes the original bug where the first file's rows
# were added twice (once via copy(), once more via append()).
mydata_File1 = pd.concat(frames, ignore_index=True)
print(mydata_File1.values)
mydata_File1.to_csv(new_csv)

If you are really just trying to combine .csv files, there is no need for pandas.
#! python3
# Concatenate all .csv files in the folder into new_csv, line by line,
# without pandas.
import glob

folder_File1 = r"C:\Users\Public\Documents\Python\CombineCSVFiles"
csv_only = r"\*.csv"
files_File1 = glob.glob(f'{folder_File1}{csv_only}')
new_csv = f'{folder_File1}\\newcsv.csv'

lines = []
for file in files_File1:
    # Compare the path BEFORE opening: the original opened every file
    # (including the combined output) and only then decided to skip it.
    if file == new_csv:
        continue
    with open(file) as filein:
        for line in filein:
            line = line.strip()  # or some other preprocessing
            lines.append(line)   # storing everything in memory!

with open(new_csv, 'w') as out_file:
    out_file.writelines(line + u'\n' for line in lines)

Related

Comparing 2 Huge csv Files in Python

I have 2 csv files.
File1:
EmployeeName,Age,Salary,Address
Vinoth,12,2548.245,"140,North Street,India"
Vinoth,12,2548.245,"140,North Street,India"
Karthick,10,10.245,"140,North Street,India"
File2:
EmployeeName,Age,Salary,Address
Karthick,10,10.245,"140,North Street,India"
Vivek,20,2000,"USA"
Vinoth,12,2548.245,"140,North Street,India"
I want to compare these 2 files and report the differences into another csv file. I've used the below python code ( version 2.7)
#!/usr/bin/env python
# Question code (Python 2.7): write to update.csv every line that appears in
# one file but not the other.
import difflib
import csv

with open('./Input/file1', 'r' ) as t1:
    fileone = t1.readlines()
with open('./Input/file2', 'r' ) as t2:
    filetwo = t2.readlines()

with open('update.csv', 'w') as outFile:
    # Limitation under discussion: `in` only tests membership, so a line that
    # occurs twice in one file and once in the other is NOT reported.
    for line in filetwo:
        if line not in fileone:
            outFile.write(line)
    for line in fileone:
        if line not in filetwo:
            outFile.write(line)
When I execute, below is the output I got:
Actual Output
Vivek,20,2000,"USA"
But my expected output is below since the Records for "Vinoth" in file1 is present 2 times, but only present 1 time in file2.
Expected Output
Vinoth,12,2548.245,"140,North Street,India"
Vivek,20,2000,"USA"
Questions
Please let me know how to get the expected output.
Also , how to get the Filename and line number of the difference record to the output file?
The issue you are running into is that the in keyword only checks for the presence of an item, not if the item exists twice. If you are open to using an external package, you can do this pretty quickly with pandas.
import pandas as pd

# Load both inputs.
df1 = pd.read_csv('Input/file1.csv')
df2 = pd.read_csv('Input/file2.csv')

# Number duplicate rows within each frame: `count` holds the 1-based
# occurrence index, so "same row, second occurrence" becomes distinguishable.
for frame in (df1, df2):
    frame['count'] = 0
    frame['count'] = frame.groupby(frame.columns.to_list()[:-1]).cumcount() + 1

# Outer-join on every column (count included); the indicator column records,
# per row, which side(s) it exists on.
df_all = df1.merge(df2, on=df1.columns.to_list(), how='outer', indicator='exists')
print(df_all)
# prints:
EmployeeName Age Salary Address count exists
0 Vinoth 12 2548.245 140,North Street,India 1 both
1 Vinoth 12 2548.245 140,North Street,India 2 left_only
2 Karthick 10 10.245 140,North Street,India 1 both
3 Vivek 20 2000.000 USA 1 right_only
# Rename the merge-indicator values so the report names the originating file,
# then export only the rows that are missing from one side.
exists_col = df_all.exists.str.replace('left_only', 'file1')
exists_col = exists_col.str.replace('right_only', 'file2')
df_all['exists'] = exists_col
df_all.query('exists != "both"').to_csv('update.csv', index=False)
Edit: non-pandas version
You can check for difference in identical line counts using the row as a key and the count as the value.
# Fixed: the module is "collections", not "collection" (the original line
# raised ImportError).
from collections import defaultdict

# Count identical lines per file; the multiset difference is the report.
c1 = defaultdict(int)
c2 = defaultdict(int)
with open('./Input/file1', 'r') as t1:
    for line in t1:
        c1[line.strip()] += 1
with open('./Input/file2', 'r') as t2:
    for line in t2:
        c2[line.strip()] += 1

# Union of all distinct rows seen in either file.
all_keys = set()
all_keys.update(c1)
all_keys.update(c2)

# A positive difference means the row occurs more often in file1, a negative
# one more often in file2; emit one output line per surplus occurrence.
out = []
for k in all_keys:
    diff = c1[k] - c2[k]
    if diff == 0:
        continue
    if diff > 0:
        out.extend([k + ',file1'] * diff)        # add which file it came from
    if diff < 0:
        out.extend([k + ',file2'] * abs(diff))   # add which file it came from

with open('update.csv', 'w') as outFile:
    outFile.write('\n'.join(out))
Use pandas DataFrame.compare:
import pandas as pd

# Fixed: filenames must be string literals (file_1.csv was a bare name and
# raised NameError).
f1 = pd.read_csv('file_1.csv')
f2 = pd.read_csv('file_2.csv')

# DataFrame.compare (pandas >= 1.1) requires both frames to have identical
# shape and labels; it returns only the cells that differ.
changed = f1.compare(f2)
change = f1[f1.index.isin(changed.index)]
print(change)

Create and Append CSV file in Python with header once

I am creating CSV file using Python code.
I am able to create and store data . But I am unable to Add Header i.e column names in my csv file.
I have Data frame created .
Code for creating and appending csv is as follows:
main_app.py:
def handle_data(data):
    # Parse one serial-port line of "key:value,key:value,..." pairs into a
    # single-row DataFrame and append it to the day's csv file.
    msg5 = data.replace("\n"," ").replace("\r"," ").split(",")
    print(msg5)
    d = dict(s.split(':') for s in msg5)
    data_frame = pd.DataFrame(list(d.items())).transpose()
    data_frame.columns = data_frame.iloc[0]  # first row holds the field names
    data_frame = data_frame.reindex(data_frame.index.drop(0))
    print(data_frame)
    filename = (value_text + time2 + ".csv")  # value_text/time2 defined elsewhere in the module
    #print(filename)
    fields = list(data_frame.columns)
    with open(filename, 'a',newline='') as writeFile:
        writeFile = csv.writer(writeFile)
        # Problem under discussion: the header row is written on EVERY call,
        # so the column names repeat before each appended data row.
        writeFile.writerow(fields)
        writeFile.writerows(data_frame.values)

def read_from_port(ser):
    # Blocking loop: read serial lines forever and hand each one to handle_data.
    while True:
        reading = ser.readline()
        handle_data(reading.decode("utf-8"))

read_from_port(serial_port)
This code adds the column names every iteration:
Output I get:
A B C
0 0 0
A B C
1 1 1
.......
Output I need is:
A B C
0 0 0
1 1 1
2 2 2
.....
Can someone help me out?
Thanks in Advance.
csv.DictWriter can do that for you!
# csv.DictWriter writes the header with writeheader(); its writerows()
# expects an iterable of dicts, so convert the DataFrame rows first
# (passing data_frame.values would fail: ndarrays have no .keys()).
csvfile = csv.DictWriter(writeFile, fieldnames=fields)
if writeFile.tell() == 0:  # file is empty/new -> write the header exactly once
    csvfile.writeheader()
csvfile.writerows(data_frame.to_dict('records'))

Two type of headers txt to Pandas dataframe

Let's say I have a .txt file like that:
#D=H|ID|STRINGIDENTIFIER
#D=T|SEQ|DATETIME|VALUE
H|879|IDENTIFIER1
T|1|1569972384|7
T|2|1569901951|9
T|3|1569801600|8
H|892|IDENTIFIER2
T|1|1569972300|109
T|2|1569907921|101
T|3|1569803600|151
And I need to create a dataframe like this:
IDENTIFIER SEQ DATETIME VALUE
879_IDENTIFIER1 1 1569972384 7
879_IDENTIFIER1 2 1569901951 9
879_IDENTIFIER1 3 1569801600 8
892_IDENTIFIER2 1 1569972300 109
892_IDENTIFIER2 2 1569907921 101
892_IDENTIFIER2 3 1569803600 151
What would be the possible code?
A basic way to do it might just to be to process the text file and convert it into a csv before using the read_csv function in pandas. Assuming the file you want to process is as consistent as the example:
import pandas as pd

# Convert the pipe-delimited H/T record file into flat csv text, then load it
# with pandas.  Assumes the file alternates H (header/identifier) rows with
# their T (data) rows, as in the example.
with open('text.txt', 'r') as file:
    fileAsRows = file.read().split('\n')

pdInput = 'IDENTIFIER,SEQ,DATETIME,VALUE\n'  # add header
Identifier = None  # guards against a T row appearing before any H row
for row in fileAsRows:
    cols = row.split('|')  # break up row
    if row.startswith('H'):  # H rows carry the identifier for the T rows below
        Identifier = cols[1] + '_' + cols[2]
    if row.startswith('T') and Identifier is not None:
        # T rows carry the actual observations
        tempList = [Identifier, cols[1], cols[2], cols[3]]
        pdInput += (','.join(tempList) + '\n')

# "w" instead of "a": re-running the script must not append a duplicate
# header and duplicate rows to an existing pdInput.csv.
with open("pdInput.csv", "w") as file:
    file.write(pdInput)

## import into pandas
df = pd.read_csv("pdInput.csv")

How to read and write to CSV Files in Python

I have a csv file, which has only a single column , which acts as my input.
I use that input to find my outputs. I have multiple outputs and I need those outputs in another csv file.
Can anyone please suggest me the ways on how to do it ?
Here is the code :
import urllib.request
jd = {input 1}
//
Some Codes to find output - a,b,c,d,e
//
** Code to write output to a csv file.
** Repeat the code with next input of input csv file.
Input CSV File has only a single column and is represented below:
1
2
3
4
5
Output would in a separate csv in a given below format :
It would be in multiple rows and multiple columns format.
a b c d e
Here is a simple example:
The data.csv is a csv with one column and multiple rows.
The results.csv contain the mean and median of the input and is a csv with 1 row and 2 columns (mean is in 1st column and median in 2nd column)
Example:
import numpy as np
import pandas as pd
import csv

# Load the single-column input (no header row).
data = pd.read_csv("data.csv", header=None)

# Statistics over the first (only) column.
calculate_mean = np.mean(data.loc[:, 0])
calculate_median = np.median(data.loc[:, 0])

# One output row: mean in column 1, median in column 2.  (The original
# wrapped each value in a list, so the csv contained "[x]" reprs.)
row = [calculate_mean, calculate_median]

# Python 3: csv.writer needs a text-mode file opened with newline=''
# ("wb" raises TypeError because csv writes str, not bytes).
with open("results.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(row)
In pseudo code, you'll do something like this:
for each_file in a_folder_that_contains_csv: # go through all the `inputs` - csv files
with open(each_file) as csv_file, open(other_file) as output_file: # open each csv file, and a new csv file
process_the_input_from_each_csv # process the data you read from the csv_file
export_to_output_file # export the data to the new csv file
Now, I won't write a full-working example because it's better for you to start digging and ask specific questions when you have some. You're now just asking: write this for me because I don't know Python.
here is the official documentation
here you can read about the csv module
here you can read about the os module
I think you need read_csv for reading the file into a Series, and to_csv for writing each output Series to a file while looping with Series.iteritems.
#file content
1
3
5
# squeeze=True was removed from read_csv in pandas 2.0; .squeeze("columns")
# collapses the one-column DataFrame to a Series the same way.
s = pd.read_csv('file', names=['a']).squeeze("columns")
print (s)
0 1
1 3
2 5
Name: a, dtype: int64
# Series.iteritems() was removed in pandas 2.0; items() is the replacement
# and yields the same (index, value) pairs.
for i, val in s.items():
    #print (val)
    #some operation with scalar value val
    df = pd.DataFrame({'a':np.arange(val)})
    df['a'] = df['a'] * 10
    print (df)
    #write to csv, file name by val
    df.to_csv(str(val) + '.csv', index=False)
a
0 0
a
0 0
1 10
2 20
a
0 0
1 10
2 20
3 30
4 40

Deleting a range of csv columns

I am trying to create an importable module to delete a range of columns (specifically columns 73-177 in the file I am working with). I am attempting to edit this file i/o code that is written for removing a column based on the field name. I want to modify this code to delete columns 73-177 in a csv file. What do I need to do to accomplish this?
def removeColumns(num1, num2, inputFILE, FileName):
    # Question code (Python 2): copy inputFILE to FileName, dropping the
    # single column whose header equals "Calendar-Year".
    # NOTE(review): num1/num2 are accepted but never used -- the asker wants
    # them to select the column range 73-177.
    inPUTfile = open(inputFILE, 'r')
    outPUTfile = open(FileName, 'w')
    line = inPUTfile.readline()
    while line:
        # Delete Specified columns. First column range number, second column range number (+1)
        lineList = line.split('\t')
        removeCOL = "Calendar-Year"
        i = 0
        # Scan the header for the column to drop; NOTE(review): raises
        # IndexError if "Calendar-Year" is not present.
        while lineList[i] != removeCOL: #(linesout?):
            i = i + 1
        lineList.pop(i) #remove these fields from the list.append
        #write modified fields
        remove = "\t".join(lineList)
        # NOTE(review): writes the ORIGINAL header line, not the joined
        # `remove` string, so the dropped column stays in the header row.
        outPUTfile.write(line) #write the new field names outfile
        for line in inPUTfile: #remove field i from each remaining line and write it in the output file &modify input line
            lineList = line.split( ) #convert to a list
            lineList.pop(i) #remove fields from the list
            line = '\t'.join(lineList)
            line = line + '\n' #add a carriage return to the end of the row
            outPUTfile.write(line)# Write the modified line in the output file
        inPUTfile.close() #close the input file
        outPUTfile.close() #close the output file
        return outPUTfile
    # NOTE(review): unreachable (after return/while) and Python 2 syntax.
    print outPUTfile
I realize that you asked how to modify the original code, but here I honestly think it'd be easier to understand how to do it a different way. Python has a useful csv module which handles a lot of the work for you. Something like:
import csv

# Drop the half-open column range [remove_from, remove_to) from every row of
# a tab-delimited csv.  (Python 2 idiom: csv files opened in binary mode.)
remove_from = 2
remove_to = 5

with open("to_delete.csv", "rb") as fp_in, open("newfile.csv", "wb") as fp_out:
    reader = csv.reader(fp_in, delimiter="\t")
    writer = csv.writer(fp_out, delimiter="\t")
    # Keeping everything before and after the range is equivalent to deleting
    # the slice in place; writerows streams the trimmed rows straight out.
    trimmed = (row[:remove_from] + row[remove_to:] for row in reader)
    writer.writerows(trimmed)
will turn
$ cat to_delete.csv
a b c d e f g
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
into
$ cat newfile.csv
a b f g
1 2 6 7
8 9 13 14
15 16 20 21

Categories

Resources