Read CSV file with quotechar-comma combination in string - Python - python

I have got multiple csv files which look like this:
ID,Text,Value
1,"I play football",10
2,"I am hungry",12
3,"Unfortunately",I get an error",15
I am currently importing the data using the pandas read_csv() function.
df = pd.read_csv(filename, sep = ',', quotechar='"')
This works for the first two rows in my csv file, unfortunately I get an error in row 3. The reason is that within the 'Text' column there is a quotechar character-comma combination before the end of the column.
ParserError: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4
Is there a way to solve this issue?
Expected output:
ID Text Value
1 I play football 10
2 I am hungry 12
3 Unfortunately, I get an error 15

You can try to fix the CSV using re module:
import re
import pandas as pd
from io import StringIO
with open("your_file.csv", "r") as f_in:
s = re.sub(
r'"(.*)"',
lambda g: '"' + g.group(1).replace('"', "\\") + '"',
f_in.read(),
)
df = pd.read_csv(StringIO(s), sep=r",", quotechar='"', escapechar="\\")
print(df)
Prints:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15

One (not so flexible) approach would be to firstly remove all " quotes from the csv, and then enclose the elements of the specific column with "" quotes(this is done to avoid misinterpreting the "," seperator while parsing), like this:
import csv
# Specify the column index (0-based)
column_index = 1
# Open the input CSV file
with open('input.csv', 'r') as f:
reader = csv.reader(f)
# Open the output CSV file
with open('output.csv', 'w', newline='') as g:
writer = csv.writer(g)
# Iterate through the rows of the input CSV file
for row in reader:
# Replace the " character with an empty string
row[column_index] = row[column_index].replace('"', '')
# Enclose the modified element in "" quotes
row[column_index] = f'"{row[column_index]}"'
# Write the modified row to the output CSV file
writer.writerow(row)
This code creates a new modified csv file
Then your problematic csv row will look like that:
3,"Unfortunately,I get an error",15"
Then you can import the data like you did: df = pd.read_csv(filename, sep = ',', quotechar='"')
To automate this conversion for all csv files within a directory:
import csv
import glob
# Specify the column index (0-based)
column_index = 1
# Get a list of all CSV files in the current directory
csv_files = glob.glob('*.csv')
# Iterate through the CSV files
for csv_file in csv_files:
# Open the input CSV file
with open(csv_file, 'r') as f:
reader = csv.reader(f)
# Open the output CSV file
output_file = csv_file.replace('.csv', '_new.csv')
with open(output_file, 'w', newline='') as g:
writer = csv.writer(g)
# Iterate through the rows of the input CSV file
for row in reader:
# Replace the " character with an empty string
row[column_index] = row[column_index].replace('"', '')
# Enclose the modified element in "" quotes
row[column_index] = f'"{row[column_index]}"'
# Write the modified row to the output CSV file
writer.writerow(row)
this names the new csv files as the old ones but with "_new.csv" instead of just ".csv".

A possible solution:
df = pd.read_csv(filename, sep='(?<=\d),|,(?=\d)', engine='python')
df = df.reset_index().set_axis(['ID', 'Text', 'Value'], axis=1)
df['Text'] = df['Text'].replace('\"', '', regex=True)
Another possible solution:
df = pd.read_csv(StringIO(text), sep='\t')
df[['ID', 'Text']] = df.iloc[:, 0].str.split(',', expand=True, n=1)
df[['Text', 'Value']] = df['Text'].str.rsplit(',', expand=True, n=1)
df = df.drop(df.columns[0], axis=1).assign(
Text=df['Text'].replace('\"', '', regex=True))
Output:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15

Related

Split CSV into multiple files based on column value

I have a poorly-structured CSV file named file.csv, and I want to split it up into multiple CSV using Python.
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
The new files need to be separated based on everything between the Family rows, so for example:
file1.csv
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
file2.csv
|A|B|C|
|Continent||1|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
What's the best way of achieving this when the number of rows between appearances of Species is not consistent?
If your file really looks like that ;) then you could use groupby from the standard library module itertools:
from itertools import groupby
def key(line): return line.startswith("|Family|")
family_line, file_no = None, 0
with open("file.csv", "r") as fin:
for is_family_line, lines in groupby(fin, key=key):
if is_family_line:
family_line = list(lines).pop()
elif family_line is None:
header = "".join(lines)
else:
file_no += 1
with open(f"file{file_no}.csv", "w") as fout:
fout.write(header + family_line)
for line in lines:
fout.write(line)
A Pandas solution would be:
import pandas as pd
df = pd.read_csv("file.csv", header=None, delimiter="|").fillna("")
blocks = df.iloc[:, 1].eq("Family").cumsum()
header_df = df[blocks.eq(0)]
for no, sdf in df.groupby(blocks):
if no > 0:
sdf = pd.concat([header_df, sdf])
sdf.to_csv(f"file{no}.csv", index=False, header=False, sep="|")
import pandas as pd
pd.read_csv('file.csv',delimiter='|')
groups = df.groupby('Family')
for name, group in groups:
    group.to_csv(name + '.csv', index=False)
Here is a pure python working method:
# Read file
with open('file.csv', 'r') as file:
text = file.read()
# Split using |Family|
splitted_text = text.split("|Family|")
# Remove unwanted content before first |Family|
splitted_text = splitted_text[1:]
# Add |Family| back to each part
splitted_text = ['|Family|' + item for item in splitted_text]
# Write files
for i, content in enumerate(splitted_text ):
with open('file{}.csv'.format(i), 'w') as file:
file.write(content)

Create csv file using python, where all the values are seperated after first spacing and creates one column

I need help to convert simple_line.txt file to csv file using the pandas library. However, I am unable to categorize image file where i want to create all the values after first space in one column.
Here is the file (sample_list.txt), listed row by row:
Image Label
doc_pres223.jpg Durasal
doc_pres224.jpg Tab Cefepime
doc_pres225.jpg Tab Bleomycin
doc_pres226.jpg Budesonide is a corticosteroid,
doc_pres227.jpg prescribed for inflammatory,
I want the csv file to be like-
enter image description here
txt_file = r"./example.txt"
csv_file = r"./example.csv"
separator = "; "
with open(txt_file) as f_in, open(csv_file, "w+") as f_out:
for line in f_in:
f_out.write(separator.join(line.split(" ", maxsplit=1)))
try this:
import pandas as pd
def write_file(filename, output):
df = pd.DataFrame()
lines = open(filename, 'r').readlines()
for l in range(1, len(lines)):
line = lines[l]
arr = line.split(" ", maxsplit=1)
image_line = arr[0]
label_line = arr[1].replace('\n', '')
df = df.append({'Image': image_line, 'Label': label_line}, ignore_index=True)
df.to_csv(output)
if __name__ == '__main__':
write_file('example.txt', 'example.csv')
If the filenames in column Image is always the same length, then you could just treat is as a fixed width file. So the first column would be 15 characters, and the rest is the second column. Then just add two empty columns and write it to a new file.
# libraries
import pandas as pd
# set filename
filename = "simple_line.txt"
# read as fixed width
df = pd.read_fwf(filename, header=0, widths=[15, 100])
# add 2 empty columns
df.insert(1, 'empty1', '')
df.insert(2, 'empty2', '')
# save as a new csv file
filenew = "output.csv"
df.to_csv(filenew, sep=';', header=True, index=False)

Split values in CSV that look like JSON

So I have a CSV file with a column called content. However, the contents in column look like it is based on JSON, and, therefore, house more columns. I would like to split these contents into multiple columns or extract the final part of it after "value". See picture below to see an example of the file. Any ideas how to get this? I would prefer using Python. I don't have any experience with JSON.
Using pandas you could do in a simpler way.
EDIT updated to handle the single quotes:
import pandas as pd
import json
data = pd.read_csv('test.csv', delimiter="\n")["content"]
res = [json.loads(row.replace("'", '"')) for row in data]
result = pd.DataFrame(res)
result.head()
# Export result to CSV
result.to_csv("result.csv")
my csv:
result:
This script will create a new csv file with the 'value' added to the csv as an additional column
(make sure that the input_csv and output_csv are different filenames)
import csv
import json
input_csv = "data.csv"
output_csv = "data_updated.csv"
values = []
with open(input_csv) as f_in:
dr = csv.DictReader(f_in)
for row in dr:
value = json.loads(row["content"].replace("'", '"'))["value"]
values.append(value)
with open(input_csv) as f_in:
with open(output_csv, "w+") as f_out:
w = csv.writer(f_out, lineterminator="\n")
r = csv.reader(f_in)
all = []
row = next(r)
row.append("value")
all.append(row)
i = 0
for row in r:
row.append(values[i])
all.append(row)
i += 1
w.writerows(all)

I need to read a csv with unknown number of columns and then write the data to a csv with a set number of columns

So I have a file that looks like this:
name,number,email,job1,job2,job3,job4
I need to convert it to one that looks like this:
name,number,email,job1
name,number,email,job2
name,number,email,job3
name,number,email,job4
How would I do this in Python?
As said in a comment that you can use pandas to read, write and manipulate csv file.
Here is one example of how you can solve your problem with pandas in python
import pandas as pd
# df = pd.read_csv("filename.csv") # read csv file from disk
# comment out below line when open from disk
df = pd.DataFrame([['ss','0152','ss#','student','others']],columns=['name','number','email','job1','job2'])
print(df)
this line output is
name number email job1 job2
0 ss 0152 ss# student others
Now we need to know how many columns are there:
x = len(df.columns)
print(x)
it will store the number of column in x
5
Now let's create a empty Dataframe with columns= [name,number,email,job]
c = pd.DataFrame(columns=['name','number','email','job'])
print(c)
output:
Columns: [name, number, email, job]
Index: []
Now we use loop from range 3 to end of the column and concat datafarme with our empty dataframe:
for i in range(3,x):
df1 = df.iloc[:,0:3].copy() # we took first 3 column
df2 = df.iloc[:,[i]].copy() # we took ith coulmn
df1['job'] = df2; # added ith coulmn to the df1
c = pd.concat([df1,c]); # concat df1 and c
print(c)
output:
name number email job
0 ss 0152 ss# others
0 ss 0152 ss# student
Dataframe c has your desired output. Now you can save it using
c.to_csv('ouput.csv')
Let's assume this is the dataframe:
import pandas as pd
df = pd.DataFrame(columns=['name','number','email','job1','job2','job3','job4'])
df = df.append({'name':'jon', 'number':123, 'email':'smth#smth.smth', 'job1':'a','job2':'b','job3':'c','job4':'d'},ignore_index=True)
We define a new dataframe:
new_df = pd.DataFrame(columns=['name','number','email','job'])
Now, we loop over the old one to split it based on the jobs. I assume you have 4 jobs to split:
for i, row in df.iterrows():
for job in range(1,5):
job_col = "job" + str(job)
new_df = new_df.append({'name':row['name'], 'number':row['number'], 'email':row['email'], 'job':row[job_col]}, ignore_index=True)
You can use the csv module and Python's unpacking syntax to get the data from the input file and write it to the output file.
import csv
with open('input.csv', newline='') as infile, open('output.csv', 'w', newline='') as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
# Skip header row, if necessary
next(reader)
# Use sequence unpacking to get the fixed variables and
# and arbitrary number of "jobs".
for name, number, email, *jobs in reader:
for job in jobs:
writer.writerow([name, number, email, job])
Below:
with open('input.csv') as f_in:
lines = [l.strip() for l in f_in.readlines()]
with open('output.csv','w') as f_out:
for idx,line in enumerate(lines):
if idx > 0:
fields = line.split(',')
for idx in range(3,len(fields)):
f_out.write(','.join(fields[:3]) + ',' + fields[idx] + '\n')
input.csv
header row
name,number,email,job1,job2,job3,job4
name1,number1,email1,job11,job21,job31,job41
output.csv
name,number,email,job1
name,number,email,job2
name,number,email,job3
name,number,email,job4
name1,number1,email1,job11
name1,number1,email1,job21
name1,number1,email1,job31
name1,number1,email1,job41

removing a column from a CSV file in python

I am attempting to remove a duplicate column from a csv file in Python3. I am able to get the code to run without error...however when I attempt to print the number of rows processed I receive a blank response in codio.
The response to running comes back as 1001codio#sigma-portal:~/workspace$
Can anyone point out how I can fix this? Ideally I would like it to print how many columns were deleted.
import csv
import re
data = []
import csv
input_file = 'customerdata.csv'
output_file = 'outputcustomerdata.csv'
cols_to_remove = [11] # Column indexes to be removed (starts at 0)
cols_to_remove = sorted(cols_to_remove, reverse=True) # Reverse so we remove from the end first
row_count = 0 # Current amount of rows processed
with open(input_file, "r") as source:
reader = csv.reader(source)
with open(output_file, "wt") as result:
writer = csv.writer(result)
for row in reader:
row_count += 1
print('\r{0}'.format(row_count), end='') # Print rows processed
for col_index in cols_to_remove:
del row[col_index]
writer.writerow(row)
Try pandas
import pandas as pd
df = pd.read_csv('customerdata.csv')
df = df.drop('column_name', axis=1)
# save your file
df.to_csv('outputcustomerdata.csv', encoding='utf-8')
print(df)

Categories

Resources