Insert a line between existing lines in a CSV file using python - python

I am creating a script that writes lines to a CSV file using Python.
For now, my script writes the CSV in this format:
Title row
Value1;Value2;.... (more than 70)
Title row2
Value1;Value2;...
I just want to be able to read the file again and insert a line of values in between rows, like the following:
Title row
Value1;Value2;.... (more than 70)
Value1;Value2;....
Title row2
Value1;Value2;...
Do you have any ideas?

import csv
with open('csvfile.csv', mode='w') as csv_file:
fieldnames = ['Title', 'row']
writer = csv.DictWriter(csv_file, delimiter=';',fieldnames=fieldnames)
writer.writeheader()
writer.writerow({'Title': 'Value1', 'row': 'Value2'})

I think you can try getting the index of the row with Header Titles and then use append to add new rows and then again combine the two dataframes. Here is the code that might work for you.
import pandas as pd
# intialise data of lists.
data = {'Title':['Tom', 'nick', 'krish', 'jack','Title','Harry'],
'Row':[20, 21, 19, 18,'Row',21]}
new_data = {'Title':['Rahib'],
'Row':[25]}
# Create DataFrame
df = pd.DataFrame(data)
new_df = pd.DataFrame(new_data)
#print(df)
index = df[df['Title'] == 'Title'].index.values.astype(int)[0]
upper_df = df.loc[:index-1]
lower_df = df.loc[index+1:]
upper_df = upper_df.append(new_df)
upper_df = upper_df.append(lower_df).reset_index(drop=True)
print(upper_df)
This will return following dataframe.
Title Row
0 Tom 20
1 nick 21
2 krish 19
3 jack 18
4 Rahib 25
5 Harry 21

Thanks to #harshal's answer and #Rahib's answer, but I've already tried pandas for a long time, and can't seem to get it to work, looks like it's not really suited for my CSV format.
Finally, I've looked at the posts provided by #Serge Ballesta, and in fact, a simple readlines and the retrieval of the line index is a pretty simple trick:
with open(output) as myFile:
for num, line in enumerate(myFile, 1):
if lookup in line:
index = num
f = open(output, "r")
contents = f.readlines()
value=';'.join(value)
f.close()
contents.insert(index+1, str(value)+'\n')
f = open(output, "w")
contents = "".join(contents)
f.write(contents)
f.close()
With output being the name of the file (passed in parameters), value being a list of values (joined for a string with ";" as delimiter), and lookup being the string i was looking for (the title row)

Related

Read CSV file with quotechar-comma combination in string - Python

I have got multiple csv files which look like this:
ID,Text,Value
1,"I play football",10
2,"I am hungry",12
3,"Unfortunately",I get an error",15
I am currently importing the data using the pandas read_csv() function.
df = pd.read_csv(filename, sep = ',', quotechar='"')
This works for the first two rows in my csv file, unfortunately I get an error in row 3. The reason is that within the 'Text' column there is a quotechar character-comma combination before the end of the column.
ParserError: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4
Is there a way to solve this issue?
Expected output:
ID Text Value
1 I play football 10
2 I am hungry 12
3 Unfortunately, I get an error 15
You can try to fix the CSV using re module:
import re
import pandas as pd
from io import StringIO
with open("your_file.csv", "r") as f_in:
s = re.sub(
r'"(.*)"',
lambda g: '"' + g.group(1).replace('"', "\\") + '"',
f_in.read(),
)
df = pd.read_csv(StringIO(s), sep=r",", quotechar='"', escapechar="\\")
print(df)
Prints:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15
One (not so flexible) approach would be to firstly remove all " quotes from the csv, and then enclose the elements of the specific column with "" quotes(this is done to avoid misinterpreting the "," seperator while parsing), like this:
import csv
# Specify the column index (0-based)
column_index = 1
# Open the input CSV file
with open('input.csv', 'r') as f:
reader = csv.reader(f)
# Open the output CSV file
with open('output.csv', 'w', newline='') as g:
writer = csv.writer(g)
# Iterate through the rows of the input CSV file
for row in reader:
# Replace the " character with an empty string
row[column_index] = row[column_index].replace('"', '')
# Enclose the modified element in "" quotes
row[column_index] = f'"{row[column_index]}"'
# Write the modified row to the output CSV file
writer.writerow(row)
This code creates a new modified csv file
Then your problematic csv row will look like that:
3,"Unfortunately,I get an error",15"
Then you can import the data like you did: df = pd.read_csv(filename, sep = ',', quotechar='"')
To automate this conversion for all csv files within a directory:
import csv
import glob
# Specify the column index (0-based)
column_index = 1
# Get a list of all CSV files in the current directory
csv_files = glob.glob('*.csv')
# Iterate through the CSV files
for csv_file in csv_files:
# Open the input CSV file
with open(csv_file, 'r') as f:
reader = csv.reader(f)
# Open the output CSV file
output_file = csv_file.replace('.csv', '_new.csv')
with open(output_file, 'w', newline='') as g:
writer = csv.writer(g)
# Iterate through the rows of the input CSV file
for row in reader:
# Replace the " character with an empty string
row[column_index] = row[column_index].replace('"', '')
# Enclose the modified element in "" quotes
row[column_index] = f'"{row[column_index]}"'
# Write the modified row to the output CSV file
writer.writerow(row)
this names the new csv files as the old ones but with "_new.csv" instead of just ".csv".
A possible solution:
df = pd.read_csv(filename, sep='(?<=\d),|,(?=\d)', engine='python')
df = df.reset_index().set_axis(['ID', 'Text', 'Value'], axis=1)
df['Text'] = df['Text'].replace('\"', '', regex=True)
Another possible solution:
df = pd.read_csv(StringIO(text), sep='\t')
df[['ID', 'Text']] = df.iloc[:, 0].str.split(',', expand=True, n=1)
df[['Text', 'Value']] = df['Text'].str.rsplit(',', expand=True, n=1)
df = df.drop(df.columns[0], axis=1).assign(
Text=df['Text'].replace('\"', '', regex=True))
Output:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15

Split values in CSV that look like JSON

So I have a CSV file with a column called content. However, the contents in column look like it is based on JSON, and, therefore, house more columns. I would like to split these contents into multiple columns or extract the final part of it after "value". See picture below to see an example of the file. Any ideas how to get this? I would prefer using Python. I don't have any experience with JSON.
Using pandas you could do in a simpler way.
EDIT updated to handle the single quotes:
import pandas as pd
import json
data = pd.read_csv('test.csv', delimiter="\n")["content"]
res = [json.loads(row.replace("'", '"')) for row in data]
result = pd.DataFrame(res)
result.head()
# Export result to CSV
result.to_csv("result.csv")
my csv:
result:
This script will create a new csv file with the 'value' added to the csv as an additional column
(make sure that the input_csv and output_csv are different filenames)
import csv
import json
input_csv = "data.csv"
output_csv = "data_updated.csv"
values = []
with open(input_csv) as f_in:
dr = csv.DictReader(f_in)
for row in dr:
value = json.loads(row["content"].replace("'", '"'))["value"]
values.append(value)
with open(input_csv) as f_in:
with open(output_csv, "w+") as f_out:
w = csv.writer(f_out, lineterminator="\n")
r = csv.reader(f_in)
all = []
row = next(r)
row.append("value")
all.append(row)
i = 0
for row in r:
row.append(values[i])
all.append(row)
i += 1
w.writerows(all)

I need to read a csv with unknown number of columns and then write the data to a csv with a set number of columns

So I have a file that looks like this:
name,number,email,job1,job2,job3,job4
I need to convert it to one that looks like this:
name,number,email,job1
name,number,email,job2
name,number,email,job3
name,number,email,job4
How would I do this in Python?
As said in a comment that you can use pandas to read, write and manipulate csv file.
Here is one example of how you can solve your problem with pandas in python
import pandas as pd
# df = pd.read_csv("filename.csv") # read csv file from disk
# comment out below line when open from disk
df = pd.DataFrame([['ss','0152','ss#','student','others']],columns=['name','number','email','job1','job2'])
print(df)
this line output is
name number email job1 job2
0 ss 0152 ss# student others
Now we need to know how many columns are there:
x = len(df.columns)
print(x)
it will store the number of column in x
5
Now let's create a empty Dataframe with columns= [name,number,email,job]
c = pd.DataFrame(columns=['name','number','email','job'])
print(c)
output:
Columns: [name, number, email, job]
Index: []
Now we use loop from range 3 to end of the column and concat datafarme with our empty dataframe:
for i in range(3,x):
df1 = df.iloc[:,0:3].copy() # we took first 3 column
df2 = df.iloc[:,[i]].copy() # we took ith coulmn
df1['job'] = df2; # added ith coulmn to the df1
c = pd.concat([df1,c]); # concat df1 and c
print(c)
output:
name number email job
0 ss 0152 ss# others
0 ss 0152 ss# student
Dataframe c has your desired output. Now you can save it using
c.to_csv('ouput.csv')
Let's assume this is the dataframe:
import pandas as pd
df = pd.DataFrame(columns=['name','number','email','job1','job2','job3','job4'])
df = df.append({'name':'jon', 'number':123, 'email':'smth#smth.smth', 'job1':'a','job2':'b','job3':'c','job4':'d'},ignore_index=True)
We define a new dataframe:
new_df = pd.DataFrame(columns=['name','number','email','job'])
Now, we loop over the old one to split it based on the jobs. I assume you have 4 jobs to split:
for i, row in df.iterrows():
for job in range(1,5):
job_col = "job" + str(job)
new_df = new_df.append({'name':row['name'], 'number':row['number'], 'email':row['email'], 'job':row[job_col]}, ignore_index=True)
You can use the csv module and Python's unpacking syntax to get the data from the input file and write it to the output file.
import csv
with open('input.csv', newline='') as infile, open('output.csv', 'w', newline='') as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
# Skip header row, if necessary
next(reader)
# Use sequence unpacking to get the fixed variables and
# and arbitrary number of "jobs".
for name, number, email, *jobs in reader:
for job in jobs:
writer.writerow([name, number, email, job])
Below:
with open('input.csv') as f_in:
lines = [l.strip() for l in f_in.readlines()]
with open('output.csv','w') as f_out:
for idx,line in enumerate(lines):
if idx > 0:
fields = line.split(',')
for idx in range(3,len(fields)):
f_out.write(','.join(fields[:3]) + ',' + fields[idx] + '\n')
input.csv
header row
name,number,email,job1,job2,job3,job4
name1,number1,email1,job11,job21,job31,job41
output.csv
name,number,email,job1
name,number,email,job2
name,number,email,job3
name,number,email,job4
name1,number1,email1,job11
name1,number1,email1,job21
name1,number1,email1,job31
name1,number1,email1,job41

Python: extracting data values from one file with IDs from a second file

I’m new to coding, and trying to extract a subset of data from a large file.
File_1 contains the data in two columns: ID and Values.
File_2 contains a large list of IDs, some of which may be present in File_1 while others will not be present.
If an ID from File_2 is present in File_1, I would like to extract those values and write the ID and value to a new file, but I’m not sure how to do this. Here is an example of the files:
File_1: data.csv
ID Values
HOT224_1_0025m_c100047_1 16
HOT224_1_0025m_c10004_1 3
HOT224_1_0025m_c100061_1 1
HOT224_1_0025m_c10010_2 1
HOT224_1_0025m_c10020_1 1
File_2: ID.xlsx
IDs
HOT224_1_0025m_c100047_1
HOT224_1_0025m_c100061_1
HOT225_1_0025m_c100547_1
HOT225_1_0025m_c100561_1
I tried the following:
import pandas as pd
data_file = pd.read_csv('data.csv', index_col = 0)
ID_file = pd.read_excel('ID.xlsx')
values_from_ID = data_file.loc[['ID_file']]
The following error occurs:
KeyError: "None of [['ID_file']] are in the [index]"
Not sure if I am reading in the excel file correctly.
I also do not know how to write the extracted data to a new file once I get the code to do it.
Thanks for your help.
With pandas:
import pandas as pd
data_file = pd.read_csv('data.csv', index_col=0, delim_whitespace=True)
ID_file = pd.read_excel('ID.xlsx', index_col=0)
res = data_file.loc[ID_file.index].dropna()
res.to_csv('result.csv')
Content of result.csv:
IDs,Values
HOT224_1_0025m_c100047_1,16.0
HOT224_1_0025m_c100061_1,1.0
In steps:
You need to read your csv with whitespace delimited:
data_file = pd.read_csv('data.csv', index_col=0, delim_whitespace=True)
it looks like this:
>>> data_file
Values
ID
HOT224_1_0025m_c100047_1 16
HOT224_1_0025m_c10004_1 3
HOT224_1_0025m_c100061_1 1
HOT224_1_0025m_c10010_2 1
HOT224_1_0025m_c10020_1 1
Now, read your Excel file, using the ids as index:
ID_file = pd.read_excel('ID.xlsx', index_col=0)
and you use its index with locto get the matching entries from your first dataframe. Drop the missing values with dropna():
res = data_file.loc[ID_file.index].dropna()
Finally, write to the result csv:
res.to_csv('result.csv')
You can do it using a simple dictionary in Python. You can make a dictionary from file 1 and read the IDs from File 2. The IDS from file 2 can be checked in the dictionary and only the matching ones can be written to your output file. Something like this could work :
with open('data.csv','r') as f:
lines = f.readlines()
#Skip the CSV Header
lines = lines[1:]
table = {l.split()[0]:l.split()[1] for l in lines if len(l.strip()) != 0}
with open('id.csv','r') as f:
lines = f.readlines()
#Skip the CSV Header
lines = lines[1:]
matchedIDs = [(l.strip(),table[l.strip()]) for l in line if l.strip() in table]
Now you will have your matched IDs and their values in a list of tuples called matchedIDs. You can write them in any format you like in a file.
I'm also new to python programming. So the code that I used below might not be the most efficient. The situation I assumed is that find ids in data.csv also in id.csv, there might be some ids in data.csv not in id.csv and vise versa.
import pandas as pd
data = pd.read_csv('data.csv')
id2 = pd.read_csv('id.csv')
data.ID = data['ID']
id2.ID = idd['IDs']
d=[]
for row in data.ID:
d.append(row)
f=[]
for row in id2.ID:
f.append(row)
g=[]
for i in d:
if i in f:
g.append(i)
data = pd.read_csv('data.csv',index_col='ID')
new_data = data.loc[g,:]
new_data.to_csv('new_data.csv')
This is the code I ended up using. It worked perfectly. Thanks to everyone for their responses.
import pandas as pd
data_file = pd.read_csv('data.csv', index_col=0)
ID_file = pd.read_excel('ID.xlsx', index_col=0)
res = data_file.loc[ID_file.index].dropna()
res.to_csv('result.csv')

How to perform a simple calculation in a CSV and append the results to the file

I have a csv which contains 38 colums of data, all I want to find our how to do is, divide column 11 by column by column 38 and append this data tot he end of each row. Missing out the title row of the csv (row 1.)
If I am able to get a snippet of code that can do this, I will be able to manipulate the same code to perform lots of similar functions.
My attempt involved editing some code that was designed for something else.
See below:
from collections import defaultdict
class_col = 11
data_col = 38
# Read in the data
with open('test.csv', 'r') as f:
# if you have a header on the file
# header = f.readline().strip().split(',')
data = [line.strip().split(',') for line in f]
# Append the relevant sum to the end of each row
for row in xrange(len(data)):
data[row].append(int(class_col)/int(data_col))
# Write the results to a new csv file
with open('testMODIFIED2.csv', 'w') as nf:
nf.write('\n'.join(','.join(row) for row in data))
Any help will be greatly appreciated. Thanks SMNALLY
import csv
with open('test.csv', 'rb') as old_csv:
csv_reader = csv.reader(old_csv)
with open('testMODIFIED2.csv', 'wb') as new_csv:
csv_writer = csv.writer(new_csv)
for i, row in enumerate(csv_reader):
if i != 0:
row.append(float(row[10]) / float(row[37]))
csv_writer.writerow(row)
Use pandas:
import pandas
df = pandas.read_csv('test.csv') #assumes header row exists
df['FRACTION'] = 1.0*df['CLASS']/df['DATA'] #by default new columns are appended to the end
df.to_csv('out.csv')

Categories

Resources