How to split one CSV into multiple files in Python

I have a CSV file (world.csv) that looks like this:
"city","city_alt","lat","lng","country"
"Mjekić","42.6781","20.9728","Kosovo"
"Mjekiff","42.6781","20.9728","Kosovo"
"paris","42.6781","10.9728","France"
"Bordeau","16.6781","52.9728","France"
"Menes","02.6781","50.9728","Morocco"
"Fess","6.6781","3.9728","Morocco"
"Tanger","8.6781","5.9728","Morocco"
And I want to split it into multiple files by country, like this:
Kosovo.csv :
"city","city_alt","lat","lng","country"
"Mjekić","42.6781","20.9728","Kosovo"
"Mjekiff","42.6781","20.9728","Kosovo"
France.csv :
"city","city_alt","lat","lng","country"
"paris","42.6781","10.9728","France"
"Bordeau","16.6781","52.9728","France"
Morocco.csv :
"city","city_alt","lat","lng","country"
"Menes","02.6781","50.9728","Morocco"
"Fess","6.6781","3.9728","Morocco"
"Tanger","8.6781","5.9728","Morocco"

If you can't use pandas, you can use the built-in csv module together with itertools.groupby() to group the rows by country:
from itertools import groupby
import csv

with open('world.csv') as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # skip header

    # Sort, then group by the country column (the last field of each row)
    lst = sorted(reader, key=lambda x: x[-1])
    groups = groupby(lst, key=lambda x: x[-1])

    # Write a file for each country
    for k, g in groups:
        filename = k + '.csv'
        with open(filename, 'w', newline='') as fout:
            csv_output = csv.writer(fout)
            csv_output.writerow(["city", "city_alt", "lat", "lng", "country"])  # header
            for line in g:
                csv_output.writerow(line)
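If the file is large, the sort can be avoided; here is a single-pass sketch (not from the answer above) that keeps one open csv.writer per country instead:
import csv

writers = {}  # one (file handle, csv.writer) pair per country
with open('world.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)
    for row in reader:
        country = row[-1]  # country is the last field of each row
        if country not in writers:
            fout = open(country + '.csv', 'w', newline='')
            w = csv.writer(fout)
            w.writerow(header)
            writers[country] = (fout, w)
        writers[country][1].writerow(row)

# close every per-country file
for fout, _ in writers.values():
    fout.close()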

Try this: filter the rows based on the country name, then write each subset to a CSV file using to_csv in pandas.
import pandas as pd

df = pd.read_csv('test.csv')
france = df[df['country']=='France']
kosovo = df[df['country']=='Kosovo']
morocco = df[df['country']=='Morocco']
france.to_csv('france.csv', index=False)
kosovo.to_csv('kosovo.csv', index=False)
morocco.to_csv('morocco.csv', index=False)
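To avoid hardcoding each country name, a short sketch (assuming the same df as above) that loops over the unique values of the country column:
import pandas as pd

df = pd.read_csv('test.csv')
for country in df['country'].unique():
    df[df['country'] == country].to_csv(f'{country}.csv', index=False)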

The easiest way to do this is as below:
# create a folder called "adata" (for example) in your working directory first
import glob

# df is the DataFrame read in the previous answer
for i, g in df.groupby('country'):
    g.to_csv('adata/{}.csv'.format(i), header=True, index_label='Index')

print(glob.glob('adata/*.csv'))
filenames = sorted(glob.glob('adata/*.csv'))
for f in filenames:
    ...  # your intended processing of each per-country file

Related

Split CSV into multiple files based on column value

I have a poorly structured CSV file named file.csv, and I want to split it up into multiple CSVs using Python.
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
The new files need to be separated based on everything between the Family rows, so for example:
file1.csv
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
file2.csv
|A|B|C|
|Continent||1|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
What's the best way of achieving this when the number of rows between appearances of Species is not consistent?
If your file really looks like that ;) then you could use groupby from the standard library module itertools:
from itertools import groupby

def key(line):
    return line.startswith("|Family|")

family_line, file_no = None, 0
with open("file.csv", "r") as fin:
    for is_family_line, lines in groupby(fin, key=key):
        if is_family_line:
            family_line = list(lines).pop()
        elif family_line is None:
            header = "".join(lines)
        else:
            file_no += 1
            with open(f"file{file_no}.csv", "w") as fout:
                fout.write(header + family_line)
                for line in lines:
                    fout.write(line)
A Pandas solution would be:
import pandas as pd

df = pd.read_csv("file.csv", header=None, delimiter="|").fillna("")
blocks = df.iloc[:, 1].eq("Family").cumsum()
header_df = df[blocks.eq(0)]

for no, sdf in df.groupby(blocks):
    if no > 0:
        sdf = pd.concat([header_df, sdf])
        sdf.to_csv(f"file{no}.csv", index=False, header=False, sep="|")
import pandas as pd

df = pd.read_csv('file.csv', delimiter='|')
groups = df.groupby('Family')
for name, group in groups:
    group.to_csv(name + '.csv', index=False)
Here is a pure-Python method that works:
# Read the file
with open('file.csv', 'r') as file:
    text = file.read()

# Split using |Family|
splitted_text = text.split("|Family|")

# Remove unwanted content before the first |Family|
splitted_text = splitted_text[1:]

# Add |Family| back to each part
splitted_text = ['|Family|' + item for item in splitted_text]

# Write the files (start=1 so they are named file1.csv, file2.csv, ...)
for i, content in enumerate(splitted_text, start=1):
    with open('file{}.csv'.format(i), 'w') as file:
        file.write(content)
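Note that this drops the header lines that appear before the first |Family| row; a small variation (a sketch under the same layout assumptions) that also copies those leading lines into every output file:
# Read the file and split on |Family|, keeping the leading header lines
with open('file.csv', 'r') as f:
    text = f.read()

parts = text.split('|Family|')
header = parts[0]  # everything before the first |Family| row
blocks = ['|Family|' + part for part in parts[1:]]

for i, content in enumerate(blocks, start=1):
    with open('file{}.csv'.format(i), 'w') as f:
        f.write(header + content)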

Read CSV file with quotechar-comma combination in string - Python

I have got multiple csv files which look like this:
ID,Text,Value
1,"I play football",10
2,"I am hungry",12
3,"Unfortunately",I get an error",15
I am currently importing the data using the pandas read_csv() function.
df = pd.read_csv(filename, sep = ',', quotechar='"')
This works for the first two rows in my CSV file; unfortunately, I get an error in row 3. The reason is that within the 'Text' column there is a quotechar-comma combination before the end of the column.
ParserError: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4
Is there a way to solve this issue?
Expected output:
ID Text Value
1 I play football 10
2 I am hungry 12
3 Unfortunately, I get an error 15
You can try to fix the CSV using the re module:
import re
import pandas as pd
from io import StringIO

with open("your_file.csv", "r") as f_in:
    s = re.sub(
        r'"(.*)"',
        lambda g: '"' + g.group(1).replace('"', "\\") + '"',
        f_in.read(),
    )

df = pd.read_csv(StringIO(s), sep=",", quotechar='"', escapechar="\\")
print(df)
Prints:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15
One (not so flexible) approach would be to first remove all " quotes from the CSV, and then enclose the elements of the specific column in " quotes (this is done to avoid misinterpreting the "," separator while parsing), like this:
import csv

# Specify the column index (0-based)
column_index = 1

# Open the input CSV file
with open('input.csv', 'r') as f:
    reader = csv.reader(f)
    # Open the output CSV file
    with open('output.csv', 'w', newline='') as g:
        writer = csv.writer(g)
        # Iterate through the rows of the input CSV file
        for row in reader:
            # Replace the " character with an empty string
            row[column_index] = row[column_index].replace('"', '')
            # Enclose the modified element in "" quotes
            row[column_index] = f'"{row[column_index]}"'
            # Write the modified row to the output CSV file
            writer.writerow(row)
This code creates a new, modified CSV file. Your problematic CSV row will then look like this:
3,"Unfortunately,I get an error",15"
Then you can import the data like you did: df = pd.read_csv(filename, sep = ',', quotechar='"')
To automate this conversion for all csv files within a directory:
import csv
import glob

# Specify the column index (0-based)
column_index = 1

# Get a list of all CSV files in the current directory
csv_files = glob.glob('*.csv')

# Iterate through the CSV files
for csv_file in csv_files:
    # Open the input CSV file
    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        # Open the output CSV file
        output_file = csv_file.replace('.csv', '_new.csv')
        with open(output_file, 'w', newline='') as g:
            writer = csv.writer(g)
            # Iterate through the rows of the input CSV file
            for row in reader:
                # Replace the " character with an empty string
                row[column_index] = row[column_index].replace('"', '')
                # Enclose the modified element in "" quotes
                row[column_index] = f'"{row[column_index]}"'
                # Write the modified row to the output CSV file
                writer.writerow(row)
This names the new CSV files like the old ones, but ending in "_new.csv" instead of just ".csv".
A possible solution:
df = pd.read_csv(filename, sep=r'(?<=\d),|,(?=\d)', engine='python')
df = df.reset_index().set_axis(['ID', 'Text', 'Value'], axis=1)
df['Text'] = df['Text'].replace('"', '', regex=True)
Another possible solution (here text holds the raw file content, e.g. text = open(filename).read()):
df = pd.read_csv(StringIO(text), sep='\t')
df[['ID', 'Text']] = df.iloc[:, 0].str.split(',', expand=True, n=1)
df[['Text', 'Value']] = df['Text'].str.rsplit(',', expand=True, n=1)
df = df.drop(df.columns[0], axis=1).assign(
    Text=df['Text'].replace('"', '', regex=True))
Output:
ID Text Value
0 1 I play football 10
1 2 I am hungry 12
2 3 Unfortunately,I get an error 15
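The same first-comma / last-comma idea also works without pandas. A standard-library sketch, assuming (as in the sample) that the ID and Value fields never contain commas and that stray inner quotes should simply be dropped; the output filename fixed.csv is only an example:
import csv

rows = []
with open('your_file.csv') as f:  # same input name as the first answer
    header = next(f).rstrip('\n').split(',')
    for line in f:
        line = line.rstrip('\n')
        if not line:
            continue
        id_, rest = line.split(',', 1)     # ID ends at the first comma
        text, value = rest.rsplit(',', 1)  # Value starts after the last comma
        rows.append([id_, text.replace('"', ''), value])

# 'fixed.csv' is an assumed output name
with open('fixed.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)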

How do I sort a csv file so that my columns are in descending order?

I'm trying to sort my CSV file so that the teams with the most gold, silver and bronze medals are at the top of the list and those with the least are at the bottom.
def rank_team(file_name):
    import csv
    file = open('medal.csv')
    for line in file:
        print(line)
    pass

rank_team('medal.csv')
This is the code that I have now. Just wondering how I would sort it all.
This is the CSV that I'm using.
CSV used to sort
Using the csv.reader and csv.writer functions, as well as sorted with a tuple key:
import csv

with open('medal.csv', 'r') as in_file:
    in_reader = csv.reader(in_file)
    header = next(in_reader)
    data = sorted(in_reader, key=lambda row: tuple(int(x) for x in row[1:]), reverse=True)

with open('sorted_medal.csv', 'w', newline='') as out_file:
    out_writer = csv.writer(out_file)
    out_writer.writerow(header)
    out_writer.writerows(data)
Result:
# Input: medal.csv
team,gold,silver,bronze
t1,17,12,38
t2,8,7,29
t3,17,11,39
t4,17,12,37
t5,8,9,30
# Output: sorted_medal.csv
team,gold,silver,bronze
t1,17,12,38
t4,17,12,37
t3,17,11,39
t5,8,9,30
t2,8,7,29
You can use the sorted function with a key. If you want to sort ascending by [Gold, Silver, Bronze], this code will help you; pass reverse=True to sorted for descending order.
import csv

def rank_team(file_name):
    with open(file_name) as f:
        reader = csv.reader(f)
        header = next(reader)
        data = [row for row in reader]
    print(header)
    # convert the medal counts to int so they sort numerically rather than as strings
    print(sorted(data, key=lambda x: (int(x[1]), int(x[2]), int(x[3]))))

rank_team('medal.csv')
You can use pandas for this. Read the csv as a pd.DataFrame and use sort_values method:
import pandas as pd
df = pd.read_csv('medal.csv')
df = df.sort_values(by=['Gold','Silver','Bronze'], ascending=False)
Note: What you describe is descending order.
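If you also want to save the sorted table, to_csv writes it back out; a minimal sketch (the output filename is only an example):
import pandas as pd

df = pd.read_csv('medal.csv')
df = df.sort_values(by=['Gold', 'Silver', 'Bronze'], ascending=False)
df.to_csv('medal_sorted.csv', index=False)  # 'medal_sorted.csv' is an assumed output name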
Here is a link that might help you. Change the 3 to 0, since you want to use the first column.

Select specific columns from CSV file

My code is able to get the 28 columns of a text file and format/remove some data. How can I select specific columns? The columns I want are 0 to 25, and column 28. What is the best approach?
Thanks in advance!
import csv
import os

my_file_name = os.path.abspath('NVG.txt')
cleaned_file = "cleanNVG.csv"
remove_words = ['INAC-EIM','-INAC','TO-INAC','TO_INAC','SHIP_TO-inac','SHIP_TOINAC']

with open(my_file_name, 'r', newline='') as infile, open(cleaned_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    cr = csv.reader(infile, delimiter='|')
    writer.writerow(next(cr)[:28])
    for line in (r[0:28] for r in cr):
        if not any(remove_word in element for element in line for remove_word in remove_words):
            line[11] = line[11][:5]
            writer.writerow(line)

infile.close()
outfile.close()
Have a look at pandas.
import pandas as pd

usecols = list(range(26)) + [28]
# sep='|' because the source file is pipe-delimited (see the csv.reader call in the question)
data = pd.read_csv(my_file_name, sep='|', usecols=usecols)
You can also conveniently write the data back to a new file:
with open(cleaned_file, 'w') as f:
    data.to_csv(f)
Exclude columns 26 and 27 from each row:
for row in cr:
    # select by position rather than row.index(x), which breaks on duplicate values
    content = [x for i, x in enumerate(row) if i not in (25, 26)]
    # work with the selected columns in content
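If you would rather stay with the csv module end to end, a minimal sketch that keeps columns 0-25 plus column 28, reusing the filenames and pipe delimiter from the question:
import csv

wanted = list(range(26)) + [28]  # columns 0-25 plus column 28

with open('NVG.txt', newline='') as infile, open('cleanNVG.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile, delimiter='|')
    writer = csv.writer(outfile)
    for row in reader:
        writer.writerow([row[i] for i in wanted])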

How to merge vertically several csv files in Python?

I need to merge vertically the data from several CSV spreadsheets in Python. Their structure is identical; I just need to put one table's data on top of the next, because they are the months of an annual survey. I tried several methods I found by googling, but I can't find a way to do something as simple as:
import csv

spreadsheets1 = open('0113_RE_fscom.csv', 'r')
spreadsheets2 = open('0213_RE_fscom.csv', 'r')
spreadsheets = spreadsheets1 + spreadsheets2

with spreadsheets as csvfile:
    sales = csv.reader(csvfile)
    for row in sales:
        print(row)
Looks like you simply forgot to iterate over the files. Try this code:
import csv

spreadsheet_filenames = [
    '0113_RE_fscom.csv',
    '0213_RE_fscom.csv',
]

for filename in spreadsheet_filenames:
    with open(filename, 'r') as csvfile:
        sales = csv.reader(csvfile)
        for row in sales:
            print(row)
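To write the stacked rows into a single output file instead of printing them, a minimal sketch (merged.csv is an assumed output name, and the files are assumed to have no header rows that need deduplicating):
import csv

spreadsheet_filenames = ['0113_RE_fscom.csv', '0213_RE_fscom.csv']

# 'merged.csv' is an assumed output name
with open('merged.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for filename in spreadsheet_filenames:
        with open(filename, newline='') as csvfile:
            writer.writerows(csv.reader(csvfile))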
How about this:
import csv

with open('0113_RE_fscom.csv', 'r') as f1, open('0213_RE_fscom.csv', 'r') as f2:
    csv1 = csv.reader(f1, delimiter=',')
    csv2 = csv.reader(f2, delimiter=',')
    for line1, line2 in zip(csv1, csv2):
        print(line1 + line2)
This is quite simple with pandas.
import pandas as pd

f1 = pd.read_csv('0113_RE_fscom.csv', header=None)
f2 = pd.read_csv('0213_RE_fscom.csv', header=None)
merged = pd.concat([f1, f2])
merged.to_csv('merged.csv', index=False, header=False)
Remove header=None if your files actually do have a header.
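Since the files are monthly, you may want to stack every matching file at once; a sketch assuming they all follow the *_RE_fscom.csv naming pattern of the two files above:
import glob
import pandas as pd

# the naming pattern is assumed from the two filenames in the question
filenames = sorted(glob.glob('*_RE_fscom.csv'))

frames = [pd.read_csv(name, header=None) for name in filenames]
merged = pd.concat(frames, ignore_index=True)
merged.to_csv('merged.csv', index=False, header=False)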
