Find duplicates in a column, then add values in adjacent column - python

I have a csv file that has a one word title and a description that is always a number.
My current code extracts just the title an description to another csv file and then converts the csv into an excel file.
import csv
import output
f = open("Johnny_Test-punch_list.csv")
csv_f = csv.reader(f)
m = open('data.csv', "w")
for row in csv_f:
m.write(row[1])
m.write(",")
m.write(row[3])
m.write("\n")
m.close()
output.toxlsx()
How can I look for matching Titles and then add the descriptions of the titles?

import csv
import output
f = open("Johnny_Test-punch_list.csv")
csv_f = csv.reader(f)
m = open('data.csv', "w")
dict_out = {}
for row in csv_f:
if row[1] in dict_out:
dict_out[row[1]] += row[3]
else:
dict_out[row[1]] = row[3]
for title, value in dict_out.iteritems():
m.write('{},{}\n'.format(title, value))

If I understood you correctly, you need to write in a single line as a string.
can you try with below code:
for row in csv_f:
m.write(row[1] + "," + str(row[3]) + "\n")

Related

Pivot a CSV string using python without using pandas or any similar library

You may think of this one as another redundant question asked, but I tried to go through all similar questions asked, no luck so far. In my specific use-case, I can't use pandas or any other similar library for this operation.
This is what my input looks like
AttributeName,Value
Name,John
Gender,M
PlaceofBirth,Texas
Name,Alexa
Gender,F
SurName,Garden
This is my expected output
Name,Gender,Surname,PlaceofBirth
John,M,,Texas
Alexa,F,Garden,
So far, I have tried to store my input into a dictionary and then tried writing it to a csv string. But, it is failing as I am not sure how to incorporate missing column values conditions. Here is my code so far
reader = csv.reader(csvstring.split('\n'), delimiter=',')
csvdata = {}
csvfile = ''
for row in reader:
if row[0] != '' and row[0] in csvdata and row[1] != '':
csvdata[row[0]].append(row[1])
elif row[0] != '' and row[0] in csvdata and row[1] == '':
csvdata[row[0]].append(' ')
elif row[0] != '' and row[1] != '':
csvdata[row[0]] = [row[1]]
elif row[0] != '' and row[1] == '':
csvdata[row[0]] = [' ']
for key, value in csvdata.items():
if value == ' ':
csvdata[key] = []
csvfile += ','.join(csvdata.keys()) + '\n'
for row in zip(*csvdata.values()):
csvfile += ','.join(row) + '\n'
For the above code as well, I took some help here. Thanks in advance for any suggestions/advice.
Edit #1 : Update code to imply that I am doing processing on a csv string instead of a csv file.
What you need is something like that:
import csv
with open("in.csv") as infile:
buffer = []
item = {}
lines = csv.reader(infile)
for line in lines:
if line[0] == 'Name':
buffer.append(item.copy())
item = {'Name':line[1]}
else:
item[line[0]] = line[1]
buffer.append(item.copy())
for item in buffer[1:]:
print item
If none of the attributes is mandatory, I think #framontb solution needs to be rearranged in order to work also when Name field is not given.
This is an import-free solution, and it's not super elegant.
I assume you have lines already in this form, with this columns:
lines = [
"Name,John",
"Gender,M",
"PlaceofBirth,Texas",
"Gender,F",
"Name,Alexa",
"Surname,Garden" # modified typo here: SurName -> Surname
]
cols = ["Name", "Gender", "Surname", "PlaceofBirth"]
We need to distinguish one record from another, and without mandatory fields the best I can do is start considering a new record when an attribute has already been seen.
To do this, I use a temporary list of attributes tempcols from which I remove elements until an error is raised, i.e. new record.
Code:
csvdata = {k:[] for k in cols}
tempcols = list(cols)
for line in lines:
attr, value = line.split(",")
try:
csvdata[attr].append(value)
tempcols.remove(attr)
except ValueError:
for c in tempcols: # now tempcols has only "missing" attributes
csvdata[c].append("")
tempcols = [c for c in cols if c != attr]
for c in tempcols:
csvdata[c].append("")
# write csv string with the code you provided
csvfile = ""
csvfile += ",".join(csvdata.keys()) + "\n"
for row in zip(*csvdata.values()):
csvfile += ",".join(row) + "\n"
>>> print(csvfile)
Name,PlaceofBirth,Surname,Gender
John,Texas,,M
Alexa,,Garden,F
While, if you want to sort columns according to your desired output:
csvfile = ""
csvfile += ",".join(cols) + "\n"
for row in zip(*[csvdata[k] for k in cols]):
csvfile += ",".join(row) + "\n"
>>> print(csvfile)
Name,Gender,Surname,PlaceofBirth
John,M,,Texas
Alexa,F,Garden,
This works for me:
with open("in.csv") as infile, open("out.csv", "w") as outfile:
incsv, outcsv = csv.reader(infile), csv.writer(outfile)
incsv.__next__() # Skip 1st row
outcsv.writerows(zip(*incsv))
Update: For input and output as strings:
import csv, io
with io.StringIO(indata) as infile, io.StringIO() as outfile:
incsv, outcsv = csv.reader(infile), csv.writer(outfile)
incsv.__next__() # Skip 1st row
outcsv.writerows(zip(*incsv))
print(outfile.getvalue())

list index out of range CSV for twitter

I'm trying to create a data structure for each tweet and I have troubles with the following code:
with open('tweets.csv', 'rb') as csvfile:
reader = csv.reader('tweets.csv', delimiter=',')
next(reader)
for row in reader:
tweet= dict()
tweet['ID'] = row[0]
tweet['Tweet'] = row[1]
tweet['Tweet cleaned'] = row[2]
tweet['Ticker'] = row[3]
tweet['date'] = int(float(row[4]))
tweet['Return'] = int(float(row[5]))
It returns an error on the line tweet['Tweet'] = row[1] : list index out of range.
I'm new to python and after googling the solution couldn't really find anything that works. Thank you!
f=open('tweets.csv','r')
reader=csv.reader(f, delimiter=',')
l1=list(reader)
for row in l1:
tweet=dict()
tweet['id']=row[0]
tweet['Tweet']=row[1]
tweet['tweet cleaned']=row[2]

Write number of rows to file name

I dont have any idea how to update my code below to the level that I can write the amount of changed row - in the CSV - to the file name.
I have did some stuff with count and row, but it is making any sense.
Can somebody give me some tips
import csv
import glob
import os
rows = []
for in_csv in glob.glob('C:/code/convert/Image/In/*.csv'):
print in_csv
with open(in_csv, 'rb') as f_input:
reader = csv.reader(f_input)
try:
all = []
row = next(reader)
row.insert(0, 'l_panoramic')
all.append(row)
for k, row in enumerate(reader):
all.append(['l_panoramic_{:06}'.format(k)] + row)
name, ext = os.path.splitext(in_csv)
with open("{}_{}{}".format(name, len(rows), ext), 'wb') as f_output:
writer = csv.writer(f_output, delimiter = ';')
writer.writerows(all)
print " {} lines found".format(len(rows))
except StopIteration as e:
print " No lines found"
This could be done using the glob library to create your list of CSV files. Use splitext() to take then existing filename and split it into a filename and extension, the number of rows can then be easily added using a format() statement as follows:
import csv
import glob
import os
rows = []
for in_csv in glob.glob('a*.csv'):
print in_csv
with open(in_csv, 'rb') as f_input:
reader = csv.reader(f_input)
try:
row = next(reader)
row.insert(0, 'L_panoramic')
rows.append(row)
for k, row in enumerate(reader):
rows.append(['l_panoramic_{:06}'.format(k)] + row)
name, ext = os.path.splitext(in_csv)
with open("{}_{}{}".format(name, len(rows), ext), 'wb') as f_output:
writer = csv.writer(f_output, delimiter = ';')
writer.writerows(rows)
print " {} lines found".format(len(rows))
except StopIteration as e:
print " No lines found"
You were already creating a list of rows to be written, so once this list is complete, you will know how many rows there are. With this you can then open the output file with the number of rows added to the name, and write all the rows to it.

(Simple Python) CSV input to usernames

I have a CSV file names.csv
First_name, Last_name
Mike, Hughes
James, Tango
, Stoke
Jack,
....etc
What I want is to be able to take the first letter of the First_name and the full Last_name and output it on screen as usernames but not include the people with First_name and Last_name property's empty. I'm completely stuck any help would be greatly appreciated
import csv
ifile = open('names.csv', "rb")
reader = csv.reader(ifile)
rownum = 0
for row in reader:
if rownum == 0:
header = row
else:
colnum = 0
for col in row:
print '%-8s: %s' % (header[colnum], col)
colnum += 1
rownum += 1
ifile.close()
Attempt #2
import csv
dataFile = open('names.csv','rb')
reader = csv.reader(dataFile)
next(reader, None)
for row in reader:
if (row in reader )
print (row[0])
I haven't saved many attempts because none of them have worked :S
import csv
dataFile = open('names.csv','rb')
reader = csv.reader(dataFile, delimiter=',', quoting=csv.QUOTE_NONE)
for row in reader:
if not row[0] or not row[1]:
continue
print (row[0][0] + row[1]).lower()
Or
import csv
dataFile = open('names.csv','rb')
reader = csv.reader(dataFile, delimiter=',', quoting=csv.QUOTE_NONE)
[(row[0][0] + row[1]).lower() for row in reader if
row[0] and row[1]]
Once you get the text from the .csv you can use the split() function to break up the text by the new lines. Your sample text is a little inconsistent, but if I understand you question correctly you can say
import csv
dataFile = open('names.csv','rb')
reader = csv.reader(dataFile)
reader = reader.split('\n')
for x in reader
print(reader[x])
Or if you want to break it up by commas just replace the '\n' with ','
Maybe like this
from csv import DictReader
with open('names.csv') as f:
dw = DictReader(f, skipinitialspace=True)
fullnames = filter(lambda n: n['First_name'] and n['Last_name'], dw)
for f in fullnames:
print('{}{}'.format(f['First_name'][0], f['Last_name']))
You have headings in your csv so use a DictReader and just filter out those whose with empty first or last names and display the remaining names.

Search for string in CSV Files using python and write the results

#!/usr/bin/python
import csv
import re
string_1 = ('OneTouch AT')
string_2 = ('LinkRunner AT')
string_3 = ('AirCheck')
#searched = ['OneTouch AT', 'LinkRunner AT', 'AirCheck']
print "hello Pythong! "
#def does_match(string):
# stringl = string.lower()
# return any(s in stringl for s in searched)
inFile = open('data.csv', "rb")
reader = csv.reader(inFile)
outFile = open('data2.csv', "wb")
writer = csv.writer(outFile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
for row in reader:
found = False
for col in row:
if col in [string_1, string_2, string_3] and not found:
writer.writerow(row)
found = True
#for row in reader:
# if any(does_match(col) for col in row):
# writer.writerow(row[:2]) # write only 2 first columns
inFile.close()
outFile.close()
I'm trying to figure out how to search a CSV file for 3 items. If those items exist print the row. Ideally I would like only Columns 1 and 3 to print to a new file.
Sample Data File
LinkRunner AT Video,10,20
Wireless Performance Video OneTouch AT,1,2
Wired OneTouch AT,200,300
LinkRunner AT,200,300
AirCheck,200,300
I'm trying to figure out how to search a CSV file for 3 items. If
those items exist print the row. Ideally I would like only Columns 1
and 3 to print to a new file.
Try this:
import csv
search_for = ['OneTouch AT','LinkRunner AT','AirCheck']
with open('in.csv') as inf, open('out.csv','w') as outf:
reader = csv.reader(inf)
writer = csv.writer(outf, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in reader:
if row[0] in search_for:
print('Found: {}'.format(row))
writer.writerow(row)
#!/usr/bin/python
import csv
import numpy as np
class search_csv(object):
def __init__(self, infile, outfile):
infile = open(infile, 'rb')
read_infile = [i for i in csv.reader(infile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)]
self.non_numpy_data = read_infile
self.data = np.array(read_infile, dtype=None)
self.outfile = open(outfile, 'wb')
self.writer_ = csv.writer(self.outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
def write_to(self, matched_values):
self.writer_.writerows(matched_values)
print ' Matched Values Written '
return True
def searcher(self, items, return_cols=[0,2]): ##// items should be passed as list -> ['OneTouch AT', 'LinkRunner AT', 'AirCheck']
find_these = np.array(items, dtype=None)
matching_y = np.in1d(self.data, find_these).reshape(self.data.shape).nonzero()[0]
matching_data = self.data[matching_y][:,return_cols]
self.write_to(matching_data)
self.outfile.close()
return True
def non_numpy_search(self, items, return_cols=[0,2]):
lst = []
for i in self.non_numpy_data:
for ii in items:
if ii in i:
z = []
for idx in return_cols:
z.append(i[idx])
lst.append(z)
break
self.write_to(lst)
return True
### now use the class ###
SEARCHING_FOR = ['OneTouch AT', 'LinkRunner AT', 'AirCheck']
IN_FILE = 'in_file.csv'
OUT_FILE = 'out_file.csv'
non_numpy_search(IN_FILE, OUT_FILE).non_numpy_search(SEARCHING_FOR)
By the phrasing of your question I'm assuming you just want to complete the task at hand and don't really care how. So copy and paste this in and use your data file as the 'IN_FILE' value and the file name you want to write to as the 'OUT_FILE' value. Place the values you want to search for in the 'SEARCHING_FOR' list as you're done.
Things to note....
SEARCHING_FOR should be a list.
the values in SEARCHING_FOR are matched EXACTLY so 'A' will not match 'a'. If you want a to use a regex or something more complex let me know.
In function 'non_numpy_search' there is a 'return_cols' parameter. It defaults to the first and 3rd column.
If you don't have numpy let me know.
#!/usr/bin/python
import csv
import re
import sys
import gdata.docs.service
#string_1 = ('OneTouch AT')
#string_2 = ('LinkRunner AT')
#string_3 = ('AirCheck')
searched = ['aircheck', 'linkrunner at', 'onetouch at']
def find_group(row):
"""Return the group index of a row
0 if the row contains searched[0]
1 if the row contains searched[1]
etc
-1 if not found
"""
for col in row:
col = col.lower()
for j, s in enumerate(searched):
if s in col:
return j
return -1
def does_match(string):
stringl = string.lower()
return any(s in stringl for s in searched)
#Opens Input file for read and output file to write.
inFile = open('data.csv', "rb")
reader = csv.reader(inFile)
outFile = open('data2.csv', "wb")
writer = csv.writer(outFile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
#for row in reader:
# found = False
# for col in row:
# if col in [string_1, string_2, string_3] and not found:
# writer.writerow(row)
# found = True
"""Built a list of items to sort. If row 12 contains 'LinkRunner AT' (group 1),
one stores a triple (1, 12, row)
When the triples are sorted later, all rows in group 0 will come first, then
all rows in group 1, etc.
"""
stored = []
for i, row in enumerate(reader):
g = find_group(row)
if g >= 0:
stored.append((g, i, row))
stored.sort()
for g, i, row in stored:
writer.writerow(tuple(row[k] for k in (0,2))) # output col 1 & 5
#for row in reader:
# if any(does_match(col) for col in row):
# writer.writerow(row[:2]) # write only 2 first columns
# Closing Input and Output files.
inFile.close()
outFile.close()

Categories

Resources