Split CSV into multiple files based on column value - python

I have a poorly-structured CSV file named file.csv, and I want to split it into multiple CSV files using Python.
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
The file needs to be split at each Family row, with each new file containing everything up to the next Family row, for example:
file1.csv
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
file2.csv
|A|B|C|
|Continent||1|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
What's the best way of achieving this when the number of rows between appearances of Family is not consistent?

If your file really looks like that ;) then you could use groupby from the standard library module itertools:
from itertools import groupby

def key(line):
    return line.startswith("|Family|")

family_line, file_no = None, 0
with open("file.csv", "r") as fin:
    # groupby yields runs of consecutive lines sharing a key value,
    # so each |Family| line separates one block from the next
    for is_family_line, lines in groupby(fin, key=key):
        if is_family_line:
            family_line = list(lines).pop()
        elif family_line is None:
            # everything before the first |Family| row is the shared header
            header = "".join(lines)
        else:
            file_no += 1
            with open(f"file{file_no}.csv", "w") as fout:
                fout.write(header + family_line)
                for line in lines:
                    fout.write(line)
A Pandas solution would be:
import pandas as pd

df = pd.read_csv("file.csv", header=None, delimiter="|").fillna("")
# each |Family| row bumps a running counter, so rows of one block share a number
blocks = df.iloc[:, 1].eq("Family").cumsum()
header_df = df[blocks.eq(0)]  # rows before the first |Family| row
for no, sdf in df.groupby(blocks):
    if no > 0:
        sdf = pd.concat([header_df, sdf])
        sdf.to_csv(f"file{no}.csv", index=False, header=False, sep="|")

import pandas as pd

df = pd.read_csv('file.csv', delimiter='|')
# note: this assumes the file has a header row with a 'Family' column
groups = df.groupby('Family')
for name, group in groups:
    group.to_csv(name + '.csv', index=False)

Here is a pure-Python method:
# Read the whole file as text
with open('file.csv', 'r') as file:
    text = file.read()
# Split on |Family|
splitted_text = text.split("|Family|")
# Drop the content before the first |Family|
splitted_text = splitted_text[1:]
# Add |Family| back to each part
splitted_text = ['|Family|' + item for item in splitted_text]
# Write the files, numbered from 1 to match file1.csv, file2.csv, ...
for i, content in enumerate(splitted_text, start=1):
    with open('file{}.csv'.format(i), 'w') as file:
        file.write(content)
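Note that this drops the |A|B|C| header and the |Continent| row, which the expected output keeps at the top of every file. A small variant of the same idea (a sketch, not tested against edge cases) carries that prefix through:
with open('file.csv', 'r') as file:
    text = file.read()

parts = text.split("|Family|")
# prefix = the header rows before the first |Family|
prefix, parts = parts[0], parts[1:]
for i, content in enumerate(parts, start=1):
    with open('file{}.csv'.format(i), 'w') as file:
        file.write(prefix + '|Family|' + content)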

Related

Read CSV file with quotechar-comma combination in string - Python

I have got multiple csv files which look like this:
ID,Text,Value
1,"I play football",10
2,"I am hungry",12
3,"Unfortunately",I get an error",15
I am currently importing the data using the pandas read_csv() function.
df = pd.read_csv(filename, sep = ',', quotechar='"')
This works for the first two rows in my csv file, but unfortunately I get an error in row 3. The reason is that within the 'Text' column there is a quotechar-comma combination before the end of the column.
ParserError: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4
Is there a way to solve this issue?
Expected output:
ID Text Value
1 I play football 10
2 I am hungry 12
3 Unfortunately, I get an error 15
You can try to fix the CSV using the re module:
import re
import pandas as pd
from io import StringIO

with open("your_file.csv", "r") as f_in:
    # re-quote the greedy match between the first and last quote on each
    # line, replacing any embedded quote with a backslash
    s = re.sub(
        r'"(.*)"',
        lambda g: '"' + g.group(1).replace('"', "\\") + '"',
        f_in.read(),
    )

df = pd.read_csv(StringIO(s), sep=",", quotechar='"', escapechar="\\")
print(df)
Prints:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15
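The reason this works is pandas' escapechar: each embedded quote inside the field is replaced with a lone backslash, so the comma that follows it is read as an escaped, literal character rather than a field separator. That is also why the stray quote itself disappears from the parsed text.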
One (not so flexible) approach would be to first remove all " quotes from the csv, and then enclose the elements of the specific column in "..." quotes (this avoids misinterpreting the "," separator while parsing), like this:
import csv

# Specify the column index (0-based)
column_index = 1

# Open the input CSV file
with open('input.csv', 'r') as f:
    reader = csv.reader(f)
    # Open the output CSV file
    with open('output.csv', 'w', newline='') as g:
        writer = csv.writer(g)
        # Iterate through the rows of the input CSV file
        for row in reader:
            # Replace the " character with an empty string
            row[column_index] = row[column_index].replace('"', '')
            # Enclose the modified element in "" quotes
            row[column_index] = f'"{row[column_index]}"'
            # Write the modified row to the output CSV file
            writer.writerow(row)
This code creates a new, modified csv file. Your problematic csv row will then look like this:
3,"Unfortunately,I get an error",15
Then you can import the data as you did: df = pd.read_csv(filename, sep=',', quotechar='"')
To automate this conversion for all csv files within a directory:
import csv
import glob

# Specify the column index (0-based)
column_index = 1

# Get a list of all CSV files in the current directory
csv_files = glob.glob('*.csv')

# Iterate through the CSV files
for csv_file in csv_files:
    # Open the input CSV file
    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        # Open the output CSV file
        output_file = csv_file.replace('.csv', '_new.csv')
        with open(output_file, 'w', newline='') as g:
            writer = csv.writer(g)
            # Iterate through the rows of the input CSV file
            for row in reader:
                # Replace the " character with an empty string
                row[column_index] = row[column_index].replace('"', '')
                # Enclose the modified element in "" quotes
                row[column_index] = f'"{row[column_index]}"'
                # Write the modified row to the output CSV file
                writer.writerow(row)
This names each new csv file like the old one, with "_new.csv" in place of ".csv".
A possible solution:
df = pd.read_csv(filename, sep=r'(?<=\d),|,(?=\d)', engine='python')
df = df.reset_index().set_axis(['ID', 'Text', 'Value'], axis=1)
df['Text'] = df['Text'].replace('"', '', regex=True)
Another possible solution (here text holds the raw contents of the file as a string):
df = pd.read_csv(StringIO(text), sep='\t')
df[['ID', 'Text']] = df.iloc[:, 0].str.split(',', expand=True, n=1)
df[['Text', 'Value']] = df['Text'].str.rsplit(',', expand=True, n=1)
df = df.drop(df.columns[0], axis=1).assign(
    Text=df['Text'].replace('"', '', regex=True))
Output:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15
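The split/rsplit pair works because splitting once from the left peels off ID and splitting once from the right peels off Value; whatever commas remain in the middle must belong to Text, so the embedded comma survives.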

Create csv file using python, where all the values are separated after first spacing and creates one column

I need help converting the simple_line.txt file to a csv file using the pandas library. However, I can't get the grouping right: I want everything after the first space on each line to go into one column.
Here is the file (sample_list.txt), listed row by row:
Image Label
doc_pres223.jpg Durasal
doc_pres224.jpg Tab Cefepime
doc_pres225.jpg Tab Bleomycin
doc_pres226.jpg Budesonide is a corticosteroid,
doc_pres227.jpg prescribed for inflammatory,
I want the csv file to look like this:
[expected output shown as an image: an Image column holding the filename and a Label column holding everything after the first space]
txt_file = r"./example.txt"
csv_file = r"./example.csv"
separator = "; "

with open(txt_file) as f_in, open(csv_file, "w+") as f_out:
    for line in f_in:
        # split once on the first space; the remainder of the line
        # (newline included) becomes the second column
        f_out.write(separator.join(line.split(" ", maxsplit=1)))
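On the sample file this should produce semicolon-separated lines like:
Image; Label
doc_pres223.jpg; Durasal
doc_pres226.jpg; Budesonide is a corticosteroid,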
try this:
import pandas as pd

def write_file(filename, output):
    with open(filename, 'r') as f:
        lines = f.readlines()
    rows = []
    for line in lines[1:]:  # skip the header line
        image, label = line.split(" ", maxsplit=1)
        # drop the trailing newline from the label
        rows.append({'Image': image, 'Label': label.replace('\n', '')})
    # build the frame from a list of dicts (DataFrame.append is gone in pandas 2.x)
    pd.DataFrame(rows).to_csv(output)

if __name__ == '__main__':
    write_file('example.txt', 'example.csv')
If the filenames in the Image column are always the same length, then you could just treat it as a fixed-width file. So the first column would be 15 characters, and the rest is the second column. Then just add two empty columns and write it to a new file.
# libraries
import pandas as pd
# set filename
filename = "simple_line.txt"
# read as fixed width
df = pd.read_fwf(filename, header=0, widths=[15, 100])
# add 2 empty columns
df.insert(1, 'empty1', '')
df.insert(2, 'empty2', '')
# save as a new csv file
filenew = "output.csv"
df.to_csv(filenew, sep=';', header=True, index=False)

Compare 2 csv files and remove the common lines from 1st file | python

I want to compare 2 csv files, master.csv and exclude.csv, remove all the matching lines based on column1, and write the final output to master.csv.
master.csv
abc,xyz
cde,fgh
ijk,lmn
exclude.csv
###Exclude list###
cde
####
Expected output (it should overwrite master.csv):
abc,xyz
ijk,lmn
What I have tried so far:
with open('exclude.csv', 'r') as in_file, open('master.csv', 'w') as out_file:
    seen = set()
    for line in in_file:
        if line in seen:
            continue  # skip duplicate
        seen.add(line)
        out_file.write(line)
I believe there are pandas or other module-based approaches, but here is a pure-Python one:
with open("master.csv") as f:
master = f.read()
with open("exclude.csv") as f:
exclude = f.read()
master = master.strip().split("\n")
exclude = exclude.strip().split("\n")
returnList = []
for line in master:
check = True
for exc in exclude:
if exc in line:
check = False
break
if check:
returnList.append(line)
with open("master.csv", "w") as f:
f.write("\n".join(returnList))
Output of master.csv
abc,xyz
ijk,lmn
Simplest way, using pandas:
import pandas as pd

# Read the csv file
df_new = pd.read_csv('Names.csv')

# Save it as an xlsx file
GFG = pd.ExcelWriter('Names.xlsx')
df_new.to_excel(GFG, index=False)
GFG.close()  # ExcelWriter.save() was removed in pandas 2.0; close() writes the file
A purely pythonic answer leveraging a set and a list comprehension:
with open('master.csv', 'r') as f:
    keep_lines = f.readlines()

with open('exclude.csv', 'r') as f:
    drop_keys = {line.strip() for line in f}

# keep a line only if its first column is not one of the excluded keys
write_lines = [line for line in keep_lines if line.strip().split(',')[0] not in drop_keys]

with open('master.csv', 'w') as f:
    f.writelines(write_lines)
You can use pandas like this:
import pandas as pd

master_df = pd.read_csv('master.csv')
exclude_df = pd.read_csv('exclude.csv')
conc = pd.concat([master_df, exclude_df])  # concatenate the two dataframes
conc.drop_duplicates(subset=['col1'], inplace=True, keep=False)
print(conc)
drop_duplicates with subset=['col1'] checks for duplicates in col1 only, and keep accepts three values: 'first', 'last', and False.
I have chosen keep=False so that no copy of any duplicate is kept.
Dataset:
master.csv:
col1,col2
abc,xyz
cde,fgh
ijk,lmn
exclude.csv:
col1
cde
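As a quick illustration of the keep parameter (a minimal sketch with made-up values):
import pandas as pd

demo = pd.DataFrame({'col1': ['abc', 'cde', 'cde', 'ijk'],
                     'col2': ['xyz', 'fgh', None, 'lmn']})
# keep='first' retains the first 'cde' row; keep=False drops both of them
print(demo.drop_duplicates(subset=['col1'], keep='first'))
print(demo.drop_duplicates(subset=['col1'], keep=False))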

How to split one csv into multiple files in python

I have a csv file (world.csv) that looks like this:
"city","city_alt","lat","lng","country"
"Mjekić","42.6781","20.9728","Kosovo"
"Mjekiff","42.6781","20.9728","Kosovo"
"paris","42.6781","10.9728","France"
"Bordeau","16.6781","52.9728","France"
"Menes","02.6781","50.9728","Morocco"
"Fess","6.6781","3.9728","Morocco"
"Tanger","8.6781","5.9728","Morocco"
And I want to split it into multiple files by country, like this:
Kosovo.csv :
"city","city_alt","lat","lng","country"
"Mjekić","42.6781","20.9728","Kosovo"
"Mjekiff","42.6781","20.9728","Kosovo"
France.csv :
"city","city_alt","lat","lng","country"
"paris","42.6781","10.9728","France"
"Bordeau","16.6781","52.9728","France"
Morocco.csv :
"city","city_alt","lat","lng","country"
"Menes","02.6781","50.9728","Morocco"
"Fess","6.6781","3.9728","Morocco"
"Tanger","8.6781","5.9728","Morocco"
If you can't use pandas, you can use the built-in csv module and the itertools.groupby() function to group the rows by country.
from itertools import groupby
import csv

with open('world.csv') as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # skip header

    # Sort, then group by the country column (the last field of each row)
    lst = sorted(reader, key=lambda x: x[-1])
    groups = groupby(lst, key=lambda x: x[-1])

    # Write a file for each country
    for k, g in groups:
        filename = k + '.csv'
        with open(filename, 'w', newline='') as fout:
            csv_output = csv.writer(fout)
            csv_output.writerow(["city", "city_alt", "lat", "lng", "country"])  # header
            for line in g:
                csv_output.writerow(line)
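Note that the sorted() call matters: itertools.groupby only merges adjacent rows, so with unsorted input a country could appear in two separate runs, and the second open(filename, 'w') would overwrite the file written for the first.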
try this:
Filter the rows based on the country name, then write each subset to a csv file using to_csv in pandas.
import pandas as pd

df = pd.read_csv('test.csv')
france = df[df['country']=='France']
kosovo = df[df['country']=='Kosovo']
morocco = df[df['country']=='Morocco']
france.to_csv('france.csv', index=False)
kosovo.to_csv('kosovo.csv', index=False)
morocco.to_csv('morocco.csv', index=False)
The easiest way to do this is as below:
# create a folder called "adata" in your working directory first
import glob
import pandas as pd

df = pd.read_csv('world.csv')  # the data from the question
for i, g in df.groupby('country'):
    g.to_csv('adata/{}.csv'.format(i), header=True, index_label='Index')

print(glob.glob('adata/*.csv'))
filenames = sorted(glob.glob('adata/*.csv'))
for f in filenames:
    # your intended processes
    ...

Find element in set from reading file line

I have a text file, file1.txt, with delimiter |:
ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
3|C|2017-12-21
And the following sets:
id_set = set(['1','2'])
date_set = set(['2017-12-19', '2017-12-20'])
I want to find the records in the file whose ID or Date matches an element of the corresponding set, and write those records from file1.txt to output.txt.
Expected output: output.txt should contain the following data:
ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
You can try out this solution:
id_set = {'1', '2'}
date_set = {'2017-12-19', '2017-12-20'}

# open files for reading and writing
with open('file1.txt') as in_file, open('output.txt', 'w') as out_file:
    # write the header line
    out_file.write(next(in_file))
    # go over the remaining lines
    for line in in_file:
        # extract id and date
        id, _, date = line.rstrip().split('|')
        # keep lines that have an id or date in the sets
        if id in id_set or date in date_set:
            out_file.write(line)
Which gives the following output.txt:
ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
If you are happy to use a 3rd party library, you can use Pandas:
import pandas as pd
from io import StringIO
mystr = StringIO("""ID|Name|Date
1|A|2017-12-19
2|B|2017-12-20
3|C|2017-12-21""")
# replace mystr with 'file1.txt'
df = pd.read_csv(mystr, sep='|')
# criteria
id_set = {'1', '2'}
date_set = {'2017-12-19', '2017-12-20'}
# apply criteria
df2 = df[df['ID'].astype(str).isin(id_set) | df['Date'].isin(date_set)]
print(df2)
# ID Name Date
# 0 1 A 2017-12-19
# 1 2 B 2017-12-20
# export to csv
df2.to_csv('file1_out.txt', sep='|')
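Note the astype(str) on the ID column: read_csv parses ID as integers, so without the cast the comparison against the string elements of id_set would match nothing.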
