I have a CSV that looks something like this:
F02303521,"Smith,Andy",GHI,"Smith,Andy",GHI,,,
F04300621,"Parker,Helen",CERT,"Yu,Betty",IOUS,,,
I want to delete all the lines where the 2nd column equals the 4th column (e.g. when Smith,Andy = Smith,Andy). I tried to do this in Python by using " as the delimiter and splitting the columns into:
F02303521, Smith,Andy ,GHI, Smith,Andy ,GHI,,,
I tried this python code:
testCSV = 'test.csv'
deletionText = 'linestodelete.txt'
correct = 'correctone.csv'
i = 0
j = 0 #where i & j keep track of line number
with open(deletionText,'w') as outfile:
with open(testCSV, 'r') as csv:
for line in csv:
i = i + 1 #on the first line, i will equal 1.
PI = line.split('"')[1]
investigator = line.split('"')[3]
#if they equal each other, write that line number into the text file
as to be deleted.
if PI == investigator:
outfile.write(i)
#From the TXT, create a list of line numbers you do not want to include in output
with open(deletionText, 'r') as txt:
lines_to_be_removed_list = []
# for each line number in the TXT
# remove the return character at the end of line
# and add the line number to list domains-to-be-removed list
for lineNum in txt:
lineNum = lineNum.rstrip()
lines_to_be_removed_list.append(lineNum)
with open(correct, 'w') as outfile:
with open(deletionText, 'r') as csv:
# for each line in csv
# extract the line number
for line in csv:
j = j + 1 # so for the first line, the line number will be 1
# if csv line number is not in lines-to-be-removed list,
# then write that to outfile
if (j not in lines_to_be_removed_list):
outfile.write(line)
but for this line:
PI = line.split('"')[1]
I get:
Traceback (most recent call last):
File "C:/Users/sskadamb/PycharmProjects/vastDeleteLine/manipulation.py", line 11, in
PI = line.split('"')[1]
IndexError: list index out of range
and I thought it would do PI = Smith,Andy investigator = Smith,Andy... why does that not happen?
Any help would be greatly appreciated, thanks!
When you think csv, think pandas, which is a great data analysis library for Python. Here's how to accomplish what you want:
import pandas as pd

# The file has no header row, so supply eight placeholder column names
# (field0 .. field7), one per comma-separated column.
fields = ['field{}'.format(i) for i in range(8)]
df = pd.read_csv("data.csv", header=None, names=fields)
# Keep only the rows whose 2nd and 4th columns differ.
df = df[df['field1'] != df['field3']]
# print(...) with parentheses works on both Python 2 and Python 3.
print(df)
This prints:
field0 field1 field2 field3 field4 field5 field6 field7
1 F04300621 Parker,Helen CERT Yu,Betty IOUS NaN NaN NaN
Try splitting on comma, not quote.
x.split(",")
Related
Using the following Python3 code, I am able to access the first column values but unable to access subsequent columns. The error is:
IndexError: list index out of range
with open('smallSample.txt', 'r') as file:
listOfLines = file.readlines()
for line in listOfLines:
print(line.strip())
header = listOfLines[0] #with all the labels
print(header.strip().split(','))
for row in listOfLines[1:]:
values = row.strip().split(',')
print(values[0]) #Able to access 1st row elements
print(values[1]) #ERROR Unable to access the Second Column Values
'''IndexError: list index out of range'''
The smallSample.txt data stored is:
Date,SP500,Dividend,Earnings,Consumer Price Index,Long Interest Rate,Real Price,Real Dividend,Real Earnings,PE10
1/1/2016,1918.6,43.55,86.5,236.92,2.09,2023.23,45.93,91.22,24.21
2/1/2016,1904.42,43.72,86.47,237.11,1.78,2006.62,46.06,91.11,24
3/1/2016,2021.95,43.88,86.44,238.13,1.89,2121.32,46.04,90.69,25.37```
Actually, your values does not collect the rows: it is re-assigned again and again inside the for loop. Use this code:
with open('data.txt', 'r') as file:
listOfLines = file.readlines()
for line in listOfLines:
print(line.strip())
header = listOfLines[0] #with all the labels
print(header.strip().split(','))
values = [] # <= look at here
for row in listOfLines[1:]:
values.append(row.strip().split(',')) # <= look at here
print(values[0]) # <= outside for loop
print(values[1])
with open('SP500.txt', 'r') as file:
lines = file.readlines()
#for line in lines:
#print(line)
#header = lines[0]
#labels = header.strip().split(',')
#print(labels)
listOfData = []
totalSP = 0.0
for line in lines[6:18]:
values = line.strip().split(',')
#print(values[0], values[1], values[5])
totalSP = totalSP + float(values[1])
listOfData.append(float(values[5]))
mean_SP = totalSP/12.0
#print(listOfData)
max_interest = listOfData[0]
for i in listOfData:
if i>max_interest:
max_interest = i
I am trying to find a few items in a CSV file. When I run the code, sometimes it works, but sometimes it produces the error "list index out of range".
def find_check_in(name,date):
x = 0
f = open('employee.csv','r')
reader = csv.reader(f, delimiter=',')
for row in reader:
id = row[0]
dt = row[1]
v = row[2]
a = datetime.strptime(dt,"%Y-%m-%d")
if v == "Check-In" and id=="person":
x = 1
f.close()
return x
Traceback (most recent call last):
File "", line 51, in
x=find_check_in(name,date)
File "", line 21, in find_check_in
id = row[0]
IndexError: list index out of range
Your CSV file contains blank lines, resulting in row becoming an empty list, in which case there is no index 0, hence the error. Make sure your input CSV has no blank line, or add a condition to process the row only if it isn't empty:
for row in reader:
if row:
# the rest of your code
Seems like reader is returning a row with no elements. Does your data contain any such rows? Or perhaps you need to pass the newline='' argument to open() when opening the file you hand to reader?
https://docs.python.org/3/library/csv.html#csv.reader
I'm working with a .csv file that lists Timestamps in one column and Wind Speeds in the second column. I need to read through this .csv file and calculate the percent of time where wind speed was above 2m/s. Here's what I have so far.
txtFile = r"C:\Data.csv"
line = o_txtFile.readline()[:-1]
while line:
line = oTextfile.readline()
for line in txtFile:
line = line.split(",")[:-1]
How do I get a count of the lines where the 2nd element in the line is greater than 2?
CSV File Sample
You will probably have to slightly update your CSV, depending on the chosen option (for option 1 and option 2, you will definitely want to remove all header rows, whereas for option 3, you will keep only the middle one, i.e. the one that starts with TIMESTAMP).
You actually have three options:
Option 1: Vanilla Python
# Count the data rows whose second column exceeds the threshold.
count = 0
with open('data.csv', 'r') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing empty line
        # float() accepts both "3" and "3.7"; int() would raise on decimals
        value = float(line.split(',')[1])
        if value > 100:
            count += 1
# Now you have the value in ``count`` variable
Option 2: CSV module
Here I use the Python's CSV module (you could as well use the DictReader, but I'll let you do the search yourself).
import csv

# Count the rows whose second column exceeds the threshold.
count = 0
with open('data.csv', 'r') as file:
    # The factory function is csv.reader; the csv module has no ``read``.
    reader = csv.reader(file, delimiter=',')
    for row in reader:
        if int(row[1]) > 100:
            count += 1
# Now you have the value in ``count`` variable
Option 3: Pandas
Pandas is a really cool, awesome library used by a lot of people to do data analysis. Doing what you want to do would look like:
import pandas as pd
df = pd.read_csv('data.csv')
# Here you are
count = len(df[df['WindSpd_ms'] > 100])
You can read the file line by line and, if a line has something in it, split it.
You count the lines read and how many are above 10m/s - then calculate the percentage:
# create data file for processing with random data
import random

random.seed(42)
with open("data.txt", "w") as f:
    # four header rows precede the data
    f.write("header\n")
    f.write("header\n")
    f.write("header\n")
    f.write("header\n")
    for sp in random.choices(range(10), k=200):
        f.write(f"some date,{sp+3.5}, data,data,data\n")

# open/read/calculate percentage of data that has 10m/s speeds
days = 0
speedGreater10 = 0
with open("data.txt", "r") as f:
    for _ in range(4):
        next(f)  # ignore first 4 rows containing headers
    for line in f:
        # A line read from a file still carries its "\n", so a bare
        # ``if line:`` is always true; strip first to really skip blanks.
        if line.strip():  # not empty
            _, speed, *p = line.split(",")
            # _ and *p are ignored (they take 'some date' + [data,data,data])
            days += 1
            if float(speed) > 10:
                speedGreater10 += 1

# NOTE: speedGreater10/days is a fraction (0.275 means 27.5 percent) even
# though it is printed with a trailing "%".
print(f"{days} datapoints, of wich {speedGreater10} "+
      f"got more then 10m/s: {speedGreater10/days}%")
Output:
200 datapoints, of wich 55 got more then 10m/s: 0.275%
Datafile:
header
header
header
header
some date,9.5, data,data,data
some date,3.5, data,data,data
some date,5.5, data,data,data
some date,5.5, data,data,data
some date,10.5, data,data,data
[... some more ...]
some date,8.5, data,data,data
some date,3.5, data,data,data
some date,12.5, data,data,data
some date,11.5, data,data,data
Below is a snippet from a csv file. The first column is the product number, 2 is the stock level, 3 is the target level, and 4 is the distance from target (target minus stock level.)
34512340,0,95,95
12395675,3,95,92
56756777,70,95,25
90673412,2,95,93
When the stock level gets to 5 or below, I want to have the stock levels updated from python when a user requests it.
I am currently using this piece of code which I have adapted from just updating one line in the CSV. It isn't working though. The first line is written back to the file as 34512340,0,95,95 and the rest of the file is deleted.
choice = input("\nTo update the stock levels of the above products, type 1. To cancel, enter anything else.")
if choice == '1':
with open('stockcontrol.csv',newline='') as f:
for line in f:
data = line.split(",")
productcode = int(data[0])
target = int(data[2])
stocklevel = int(data[1])
if stocklevel <= 5:
target = str(target)
import sys
import csv
data=[]
newval= target
newtlevel = "0"
f=open("stockcontrol.csv")
reader=csv.DictReader(f,fieldnames=['code','level', 'target', 'distancefromtarget'])
for line in reader:
line['level']= newval
line['distancefromtarget']= newtlevel
data.append('%s,%s,%s,%s'%(line['code'],line['level'],line['target'],line['distancefromtarget']))
f.close()
f=open("stockcontrol.csv","w")
f.write("\n".join(data))
f.close()
print("The stock levels were updated successfully")
else:
print("Goodbye")
Here is the code that I had changing one line in the CSV file and works:
with open('stockcontrol.csv',newline='') as f:
for line in f:
if code in line:
data = line.split(",")
target = (data[2])
newlevel = stocklevel - quantity
updatetarget = int(target) - int(newlevel)
stocklevel = str(stocklevel)
newlevel = str(newlevel)
updatetarget = str(updatetarget)
import sys
import csv
data=[]
code = code
newval= newlevel
newtlevel = updatetarget
f=open("stockcontrol.csv")
reader=csv.DictReader(f,fieldnames=['code','level', 'target', 'distancefromtarget'])
for line in reader:
if line['code'] == code:
line['level']= newval
line['distancefromtarget']= newtlevel
data.append('%s,%s,%s,%s'%(line['code'],line['level'],line['target'],line['distancefromtarget']))
f.close()
f=open("stockcontrol.csv","w")
f.write("\n".join(data))
f.close()
What can I change to make the code work? I basically want the program to loop through each line of the CSV file, and if the stock level (column 2) is equal to or less than 5, update the stock level to the target number in column 3, and then set the number in column 4 to zero.
Thanks,
The below code reads each line and checks the value of column 2. If it is less than or equal to 5, the value of column2 is changed to value of column3 and last column is changed to 0 else all the columns are left unchanged.
import sys
import csv

# Rewritten rows, collected in full before the file is overwritten.
data = []

# ``with`` closes the file even if a row fails to parse; newline='' is what
# the csv module expects for files it reads.
with open("stockcontrol.csv", newline='') as f:
    reader = csv.DictReader(
        f, fieldnames=['code', 'level', 'target', 'distancefromtarget'])
    for line in reader:
        if int(line['level']) <= 5:
            # Restock: the level becomes the target, so the distance is 0.
            line['level'] = line['target']
            line['distancefromtarget'] = 0
        data.append("%s,%s,%s,%s" % (line['code'], line['level'],
                                     line['target'],
                                     line['distancefromtarget']))

# Only now replace the file contents with the updated rows.
with open("stockcontrol.csv", "w") as f:
    f.write("\n".join(data))
Coming to issues in your code:
You are first reading the file without using the csv module and getting the values in each column by splitting the line. You are again using the DictReader method of csv module to read the values you already had.
unique.txt file contains: 2 columns with columns separated by tab. total.txt file contains: 3 columns each column separated by tab.
I take each row from unique.txt file and find that in total.txt file. If present then extract entire row from total.txt and save it in new output file.
###Total.txt
column a column b column c
interaction1 mitochondria_205000_225000 mitochondria_195000_215000
interaction2 mitochondria_345000_365000 mitochondria_335000_355000
interaction3 mitochondria_345000_365000 mitochondria_5000_25000
interaction4 chloroplast_115000_128207 chloroplast_35000_55000
interaction5 chloroplast_115000_128207 chloroplast_15000_35000
interaction15 2_10515000_10535000 2_10505000_10525000
###Unique.txt
column a column b
mitochondria_205000_225000 mitochondria_195000_215000
mitochondria_345000_365000 mitochondria_335000_355000
mitochondria_345000_365000 mitochondria_5000_25000
chloroplast_115000_128207 chloroplast_35000_55000
chloroplast_115000_128207 chloroplast_15000_35000
mitochondria_185000_205000 mitochondria_25000_45000
2_16595000_16615000 2_16585000_16605000
4_2785000_2805000 4_2775000_2795000
4_11395000_11415000 4_11385000_11405000
4_2875000_2895000 4_2865000_2885000
4_13745000_13765000 4_13735000_13755000
My program:
file=open('total.txt')
file2 = open('unique.txt')
all_content=file.readlines()
all_content2=file2.readlines()
store_id_lines = []
ff = open('match.dat', 'w')
for i in range(len(all_content)):
line=all_content[i].split('\t')
seq=line[1]+'\t'+line[2]
for j in range(len(all_content2)):
if all_content2[j]==seq:
ff.write(seq)
break
Problem:
But it does not give the desired output (the 1st-column values of the rows that fulfil the if condition). I need something like: if the jth row of unique.txt == the ith row of total.txt, then write the ith row of total.txt into a new file.
import csv

# Both inputs are tab-separated, so the csv module must be told so; with the
# default comma delimiter each line is parsed as one field and nothing matches.
with open('unique.txt', newline='') as unique_file, \
        open('total.txt', newline='') as total_file:
    # Each non-empty row of unique.txt becomes a (column a, column b) tuple.
    uniques = [tuple(row)
               for row in csv.reader(unique_file, delimiter='\t') if row]
    # Map the last two columns of each total.txt row to the full row.
    totals = {}
    for row in csv.reader(total_file, delimiter='\t'):
        if row:
            totals[tuple(row[1:])] = row

with open('output.txt', 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter='\t')
    for key in uniques:
        match = totals.get(key)
        # Skip pairs that are absent from total.txt rather than writing an
        # empty row for them.
        if match is not None:
            writer.writerow(match)
I would write your code this way:
# Load the pairs to look for; stripping the line ending makes the comparison
# independent of "\n" vs "\r\n" and of a missing final newline, and a set
# keeps each lookup cheap.
with open('unique.txt') as file2:
    unique_pairs = {line.rstrip('\r\n') for line in file2}

# ``with`` closes both files, so everything written to match.dat is flushed.
with open('total.txt') as file, open('match.dat', 'w') as ff:
    for curr_line_total in file:
        line = curr_line_total.rstrip('\r\n').split('\t')
        if len(line) < 3:
            continue  # blank or malformed line: nothing to compare
        seq = line[1] + '\t' + line[2]
        if seq in unique_pairs:
            # Write the whole matching row of total.txt, not just the pair.
            ff.write(curr_line_total)
Please avoid readlines() and use the with syntax when you open your files.
This explains why you don't need to use readlines()