Showing duplicates in columns of csv files - python

I am trying to read a particular column ("Labels") of any .csv file in a path. Then I want to print each duplicate and the number of times that duplicate appeared.
import os
import csv
from collections import Counter
items = []
directory = os.path.join("c:\\","Users\Bob\Desktop\CSVs")
for root,dirs,files in os.walk(directory):
for file in files:
if file.endswith(".csv"):
with open(file) as csvFile:
reader = csv.DictReader(file)
for row in reader:
items.append(row["Labels"])
print(row)
counted = dict(Counter(items))
print(counted)
I get the following error
File "C:/Users/Bob/Desktop/CSVs/Dupe Check.py", line 14, in <module>
items.append(row["Labels"])
KeyError: 'Labels'
The labels column is always the second column of the csv files.

The problem is that you are passing the file name (a string) to csv.DictReader instead of the opened file object; the reader therefore never sees the real header row, so it cannot find the key "Labels".
with open(file) as csvFile:
reader = csv.DictReader(file)
Try replacing file with csvFile instead.
# os.walk yields bare file names, so join each one with `root`; opening
# `file` alone only works when the script runs from inside that directory.
with open(os.path.join(root, file), newline="") as csvFile:
    # Pass the open file object (not the file name) so the header row is parsed.
    reader = csv.DictReader(csvFile)
If you print the variable reader (or, better, reader.fieldnames), you will get a better understanding of what is actually being read.

Related

Writing and appending multiple csv data into new csv using python

I have a directory where there are multiple csv files. Currently I am able to read all the files sequentially using for loop and display their contents.
I need to write the contents of all the csv files sequentially into a new csv file, but I am missing something: my new csv file has no data in it.
this is what I am doing :
import os
import csv
path = r'C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ\\'
fileNames = os.listdir(path)
for f in fileNames:
file = open(path+f)
csvreader = csv.reader(file)
rows = []
for row in csvreader:
rows.append(row)
for i in rows:
print(i)
#OFile = open('C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ\ALL_DATA.csv','w')
writer = csv.writer(open('C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ\ALL_DATA.csv', 'wb'))
#for row in csvreader:
# row1 = csvreader.next()
writer.writerow(i)
You are overwriting the file for each row you try to write.
Opening a file with the w mode (here 'wb') truncates the file if it already exists.
The mode you need in order to append to a file (or to create it if it does not exist) is a.
See Python File Open for more information about Python file modes.
import os
import csv


def append_csv_files(path, output_path):
    """Append the rows of every file in *path* to the CSV file *output_path*.

    The output file is opened once, in append mode, so rows are added after
    whatever it already contains instead of overwriting it on every write.

    Args:
        path: Directory holding the input CSV files.
        output_path: CSV file to append to (created if it does not exist).

    Returns:
        The number of rows written.
    """
    output_abs = os.path.abspath(output_path)
    # List the inputs up front, in a reproducible order. Skip directories and
    # the output file itself: it may live in the same directory, and reading
    # it while appending to it would feed the file back into itself.
    file_names = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
        and os.path.abspath(os.path.join(path, name)) != output_abs
    )

    written = 0
    # newline='' lets the csv module control line endings (no blank lines
    # between rows on Windows).
    with open(output_path, 'a', newline='') as output:
        writer = csv.writer(output)
        for name in file_names:
            with open(os.path.join(path, name), 'r', newline='') as source:
                for row in csv.reader(source):
                    print(row)
                    writer.writerow(row)
                    written += 1
    return written


if __name__ == '__main__':
    # Raw string: in a normal literal '\U' starts a unicode escape and is a
    # SyntaxError on Python 3.
    path = r'C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ'
    append_csv_files(path, os.path.join(path, 'ALL_DATA.csv'))
If the csv files have the same columns and formats you could also simply copy the first file and append the others, excluding their headers.
import os
import shutil


def concat_csv_files(path, output):
    """Concatenate the files in *path* into *output*, keeping a single header.

    The first file (in sorted name order) is copied verbatim; the first line
    of every other file is treated as a header and skipped. This assumes all
    files share the same columns and that the header occupies exactly one
    physical line.

    Args:
        path: Directory holding the input CSV files.
        output: Path of the combined file (overwritten if it exists).

    Returns:
        The input file names that were merged, in the order used. An empty
        list means there was nothing to merge and no output was created.
    """
    output_abs = os.path.abspath(output)
    # Skip directories and the output file itself; the latter may sit in the
    # same directory and copying it onto itself raises shutil.SameFileError.
    file_names = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
        and os.path.abspath(os.path.join(path, name)) != output_abs
    )
    if not file_names:
        return file_names

    # Copy the first file, header included.
    shutil.copyfile(os.path.join(path, file_names[0]), output)

    # Work in binary mode so line endings are passed through untouched.
    with open(output, 'rb') as copied:
        content = copied.read()
    needs_newline = bool(content) and not content.endswith(b'\n')
    del content

    # Append the remaining files line by line, excluding each first line.
    with open(output, 'ab') as out:
        for name in file_names[1:]:
            with open(os.path.join(path, name), 'rb') as in_:
                next(in_, None)  # skip the header; tolerate empty files
                for line in in_:
                    if needs_newline:
                        # The previous file did not end with a newline;
                        # without this its last row would fuse with this one.
                        out.write(b'\n')
                    out.write(line)
                    needs_newline = not line.endswith(b'\n')
    return file_names


if __name__ == '__main__':
    path = r'C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ'
    output = r'C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ\ALL_DATA.csv'
    concat_csv_files(path, output)

How to read two csv files and to concatenate them?

First, I need to import two csv files.
Then I need to remove header in both files.
After that, I would like to take one column from both files and to concatenate them.
I have tried to open files, but I'm not sure how to concatenate.
Can anyone give advice how to proceed?
import csv
x = []
chamber_temperature = []
with open(r"C:\Users\mm02058\Documents\test.txt", 'r') as file:
reader = csv.reader(file, delimiter='\t')
with open(r"C:\Users\mm02058\Documents\test.txt", 'r') as file1:
reader_1 = csv.reader(file1, delimiter='\t')
for row in (reader):
x.append(row[0])
chamber_temperature.append(row[1])
for row in (reader_1):
x.append(row[0])
chamber_temperature.append(row[1])
The immediate bug is that you are trying to read from reader_1 outside its with block, which means Python has already closed the underlying file.
But the nesting of the with calls is just confusing and misleading anyway. Here is a generalization which should allow you to extend with more new files easily.
import csv


def read_columns(filenames, delimiter='\t'):
    """Collect the first two columns of several delimited text files.

    The first line of each file is treated as a header and skipped; blank
    lines are ignored. Each file is opened, read and closed in turn, so no
    reader is ever used after its file has been closed.

    Args:
        filenames: Iterable of file paths to read, in order.
        delimiter: Field separator used in the files.

    Returns:
        A tuple ``(x, chamber_temperature)`` of two lists holding the first
        and second field of every data row, across all files.

    Raises:
        IndexError: If a non-blank data row has fewer than two fields.
    """
    x = []
    chamber_temperature = []
    for filename in filenames:
        # newline='' is what the csv module expects for text-mode files.
        with open(filename, 'r', newline='') as file:
            reader = csv.reader(file, delimiter=delimiter)
            next(reader, None)  # skip header line; tolerate an empty file
            for row in reader:
                if not row:
                    continue  # csv.reader yields [] for a blank line
                x.append(row[0])
                chamber_temperature.append(row[1])
    return x, chamber_temperature


if __name__ == '__main__':
    x, chamber_temperature = read_columns((
        r"C:\Users\mm02058\Documents\test.txt",
        r"C:\Users\mm02058\Documents\test.txt",
    ))
Because of how you have structured your code, the context manager for file1 will close the file before it has been used by the for loop.
Use a single with statement to open both files, e.g.:
with open('file1', 'r') as file1, open('file2', 'r') as file2:
# Your code in here
for row in (reader_1):
x.append(row[0])
chamber_temperature.append(row[1])
You are getting this error because you have placed this code block outside the second with block, so the file has already been closed by the time the loop runs.
You can either open both the files at once with this
with open('file1', 'r') as file1, open('file2', 'r') as file2:
# Your code in here
or you can use pandas for opening and concatenating csv files
import pandas as pd
data = pd.read_csv(r'file.csv', header=None)
and then see the pandas documentation on concatenating DataFrames (pd.concat).

I have a csv file and i want to extract each row of csv file into different csv file . how can i do that?

I have a CSV file and I want to write each row of it to a separate CSV file. How can I do that?
Like this; each row is saved to its own file, numbered by the row's index:
import csv


def split_rows(source='file.csv', pattern='file_{}.csv'):
    """Write every row of *source* to its own CSV file.

    Args:
        source: Path of the CSV file to split.
        pattern: ``str.format`` pattern for the output paths; ``{}`` is
            replaced by the zero-based row index.

    Returns:
        The number of files written (one per row).
    """
    count = 0
    # newline='' on both files lets the csv module handle line endings;
    # without it the output gets '\r\r\n' line ends on Windows.
    with open(source, 'r', newline='') as csv_file:
        # skipinitialspace drops blanks that directly follow a delimiter.
        rows = csv.reader(csv_file, skipinitialspace=True)
        for i, row in enumerate(rows):
            with open(pattern.format(i), 'w', newline='') as write_file:
                csv.writer(write_file).writerow(row)
            count = i + 1
    return count


if __name__ == '__main__':
    split_rows()

how to remove a specific row in csv file based upon the duplicate using python?

I have a csv file which has many rows looks like below.
20170718 014418.475476 [UE:142 CRNTI : 446]
20170718 094937.865362 [UE:142 CRNTI : 546]
Above are the sample two rows of the csv file.
Looking at the rows, the string [UE:142 ...] appears more than once in the csv file.
Problem statement:
I want to remove every row whose [UE:nnn string (where nnn is some number) has already appeared in an earlier row of the csv file. In the rows above, [UE:142 appears twice, so the second row must be deleted. The file contains many other such strings, like [UE:142 but with different numbers, and each of them should be handled the same way.
Can anyone please help me with python script for the above problem statement?
import csv
reader = open("test.csv", "r")
lines = reader.read().split(" ")
reader.close()
writer = open("test_1.csv", "w")
for line in set(lines):
writer.write(line)
writer.close()
from csv import reader, writer as csv_writer
csv_path = '<your csv file path here>'
def remove_duplicate_ue(csv_path):
    """Yield the rows of *csv_path*, dropping rows that repeat a ``UE:`` tag.

    The file is read as space-delimited CSV. A row is identified by the first
    field that contains ``UE:`` (for example ``[UE:142``). The first row seen
    for each tag is yielded; later rows with the same tag are skipped. Rows
    without any ``UE:`` field, including blank lines, are always yielded.

    Args:
        csv_path: Path of the space-delimited file to read.

    Yields:
        Each kept row as a list of fields, in file order.
    """
    seen = set()
    with open(csv_path, 'r', newline='') as csv_file:
        for line in reader(csv_file, delimiter=' '):
            # After splitting on spaces the tag is not the last field
            # (['...', '[UE:142', 'CRNTI', ':', '446]']), so search them all.
            ue_tag = next((field for field in line if 'UE:' in field), None)
            if ue_tag is None:
                yield line
            elif ue_tag not in seen:
                # Remember each tag separately so that different UE ids are
                # deduplicated independently of one another.
                seen.add(ue_tag)
                yield line
def write_csv(csv_path, rows, delimiter=' '):
    """Write *rows* to *csv_path* as delimited text, replacing the file.

    The file is opened in ``'w'`` mode, which truncates it immediately. If
    *rows* is a lazy iterable that reads from the same path, materialize it
    (e.g. with ``tuple``) before calling this function.

    Args:
        csv_path: Path of the file to (over)write.
        rows: Iterable of rows, each an iterable of field values.
        delimiter: Field separator to write between values.
    """
    # newline='' lets the csv module emit its own line terminator; without it
    # every row is followed by an extra blank line on Windows.
    with open(csv_path, 'w', newline='') as csv_file:
        csv_writer(csv_file, delimiter=delimiter).writerows(rows)
# Read everything first by wrapping the generator in tuple(): write_csv
# reopens the same path in 'w' mode, which truncates the file, so all rows
# must already be in memory before the write starts.
write_csv (csv_path, tuple (remove_duplicate_ue (csv_path)))

Python Read Text File Column by Column

So I have a text file that looks like this:
1,989785345,"something 1",,234.34,254.123
2,234823423,"something 2",,224.4,254.123
3,732847233,"something 3",,266.2,254.123
4,876234234,"something 4",,34.4,254.123
...
I'm running this code right here:
file = open("file.txt", 'r')
readFile = file.readline()
lineID = readFile.split(",")
print lineID[1]
This lets me break up the content in my text file by "," but what I want to do is separate it into columns because I have a massive number of IDs and other things in each line. How would I go about splitting the text file into columns and call each individual row in the column one by one?
You have a CSV file, use the csv module to read it:
import csv

# Python 3: open in text mode with newline='' as the csv module expects.
# (On Python 2 the file had to be opened with 'rb' instead.)
with open('file.txt', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # Each row is a list of column values for one line of the file;
        # row[1] is the second column (the ID in the sample data).
        print(row[1])
This still gives you data by row, but with the zip() function you can transpose this to columns instead:
import csv

# Python 3: open in text mode with newline='' as the csv module expects.
# (On Python 2 the file had to be opened with 'rb' instead.)
with open('file.txt', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # zip(*reader) consumes every row and regroups the values by position, so
    # each `column` is a tuple of all values in one column. zip stops at the
    # shortest row, so ragged rows lose their trailing fields.
    for column in zip(*reader):
        print(column)
Do be careful with the latter; the whole file will be read into memory in one go, and a large CSV file could eat up all your available memory in the process.

Categories

Resources