Split CSV file which contains multiple tables into different pandas DataFrames (Python)

I have multiple CSV files, each of which contains several tables separated by blank lines.
Example:
Technology C_inv [MCHF/y] C_maint [MCHF/y]
NUCLEAR 70.308020 33.374568
HYDRO_DAM_EXISTING 0.000000 195.051200
HYDRO_DAM 67.717942 1.271600
HYDRO_RIVER_EXISTING 0.000000 204.820000
IND_BOILER_OIL 2.053610 0.532362
IND_BOILER_COAL 4.179935 1.081855
IND_BOILER_WASTE 11.010126 2.849652
DEC_HP_ELEC 554.174644 320.791276
DEC_THERMAL_HP_GAS 77.077291 33.717477
DEC_BOILER_GAS 105.586089 41.161335
DEC_BOILER_OIL 33.514266 25.948450
H2_FROM_GAS 145.185290 59.178082
PYROLYSIS 132.200818 112.392123
Storage technology C_inv [MCHF/y] C_maint [MCHF/y]
HYDRO_STORAGE 0.000000 0.000000
Resource C_op [MCHF/y]
ELECTRICITY 1174.452848
GASOLINE 702.000000
DIESEL 96.390000
OIL 267.787558
NG 1648.527242
WOOD 592.110000
COAL 84.504083
URANIUM 18.277626
WASTE 0.000000
All my CSV files have different sub-table names, but there are few enough of them that I could list them manually for detection if required.
Another issue is that many titles include spaces (e.g. "Storage technology"), which pandas reads as two columns.
I initially tried to do it directly with pandas, splitting manually, but the on_bad_lines='skip' argument that suppresses the errors also skips useful lines:
Cost_bd = pd.read_csv(f"{Directory}/cost_breakdown.csv",on_bad_lines='skip',delim_whitespace=True).dropna(axis=1,how='all')
colnames = ['Technology', 'C_inv[MCHF/y]', 'C_maint[MCHF/y]']
Cost_bd.columns = colnames
I believe it might be better to scan the file as text and split it, but I'm unsure of the best way to do this.
I have also tried the solution provided in this thread:
import csv
from os.path import dirname # gets parent folder in a path
from os.path import join # concatenate paths
table_names = ["Technology", "Storage technology", "Resource"]
df = pd.read_csv(f"{Directory}/cost_breakdown.csv", header=None, names=range(3))
groups = df[0].isin(table_names).cumsum()
tables = {g.iloc[0,0]: g.iloc[1:] for k,g in df.groupby(groups)}
but it doesn't work:
tables.keys()=
dict_keys(['Technology\tC_inv [MCHF/y]\tC_maint [MCHF/y]'])
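The likely culprit: the header rows are tab-separated (hence the single tab-filled key), while read_csv defaults to sep=',', so every full line lands in column 0 and isin never matches a table name. A minimal sketch of the same grouping idea that matches on the first tab-separated token instead (hedged: it assumes the headers really are tab-delimited, and the grouped rows still hold unsplit lines, which the solution below takes care of):

groups = df[0].str.split('\t').str[0].isin(table_names).cumsum()
tables = {g.iloc[0, 0].split('\t')[0]: g.iloc[1:] for _, g in df.groupby(groups)}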
EDIT: Final solution, based on @Rabinzel's answer below:
import re
import pandas as pd

def make_df(group, dict_of_dfs):
    header, data = re.split(r'\t', group[0]), list(map(str.split, group[1:]))
    if len(header) != len(data[0]):  # header shorter than the data: borrow column names from the first table
        header = header + dict_of_dfs[list(dict_of_dfs.keys())[0]].columns.tolist()[1:]
    dict_of_dfs[header[0]] = pd.DataFrame(data, columns=header)
    return dict_of_dfs

def Read_csv_as_df(path, file_name):
    with open(path + file_name) as f:
        dict_of_dfs = {}
        group = []
        for line in f:
            if line != '\n':
                group.append(line.strip())
            else:
                print(dict_of_dfs)
                dict_of_dfs = make_df(group, dict_of_dfs)
                group = []
        dict_of_dfs = make_df(group, dict_of_dfs)
    return dict_of_dfs
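For reference, a hypothetical call (assuming Directory points at the folder used earlier):

dfs = Read_csv_as_df(f"{Directory}/", "cost_breakdown.csv")
print(dfs.keys())  # e.g. dict_keys(['Technology', 'Storage technology', 'Resource'])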

I would do it the following way.
Iterate through each row, append each chunk separated by a blank line to a list, and build DataFrames from those lists. To deal with the column names that contain spaces, use re.split and split only where there are two or more consecutive spaces.
Save the different df's in a dictionary where the key is the first element of the header of each df.
import re
import pandas as pd

def make_df(group):
    # split the header on runs of two or more spaces, the data rows on any whitespace
    header, data = re.split(r'\s\s+', group[0]), list(map(str.split, group[1:]))
    dict_of_dfs[header[0]] = pd.DataFrame(data, columns=header)

with open('your_csv_file.csv') as f:
    dict_of_dfs = {}
    group = []
    for line in f:
        if line != '\n':
            group.append(line.strip())
        else:
            make_df(group)
            group = []
    make_df(group)

for key, value in dict_of_dfs.items():
    print(f"{key=}\ndf:\n{value}\n---------------------")
Output:
key='Technology'
df:
Technology C_inv [MCHF/y] C_maint [MCHF/y]
0 NUCLEAR 70.308020 33.374568
1 HYDRO_DAM_EXISTING 0.000000 195.051200
2 HYDRO_DAM 67.717942 1.271600
3 HYDRO_RIVER_EXISTING 0.000000 204.820000
4 IND_BOILER_OIL 2.053610 0.532362
5 IND_BOILER_COAL 4.179935 1.081855
6 IND_BOILER_WASTE 11.010126 2.849652
7 DEC_HP_ELEC 554.174644 320.791276
8 DEC_THERMAL_HP_GAS 77.077291 33.717477
9 DEC_BOILER_GAS 105.586089 41.161335
10 DEC_BOILER_OIL 33.514266 25.948450
11 H2_FROM_GAS 145.185290 59.178082
12 PYROLYSIS 132.200818 112.392123
---------------------
key='Storage technology'
df:
Storage technology C_inv [MCHF/y] C_maint [MCHF/y]
0 HYDRO_STORAGE 0.000000 0.000000
---------------------
key='Resource'
df:
Resource C_op [MCHF/y]
0 ELECTRICITY 1174.452848
1 GASOLINE 702.000000
2 DIESEL 96.390000
3 OIL 267.787558
4 NG 1648.527242
5 WOOD 592.110000
6 COAL 84.504083
7 URANIUM 18.277626
8 WASTE 0.000000
---------------------

Related

Parsing idx file or string to Pandas DataFrame

I would like to parse the following idx file: https://www.sec.gov/Archives/edgar/daily-index/2022/QTR1/company.20220112.idx into Pandas DataFrame.
I use the following code to check what it looks like as a text file:
import os, requests

base_path = '/Users/GunardiLin/Desktop/Insider_Ranking/temp/'
current_dirs = os.listdir(path=base_path)
local_filename = f'20200102'
local_file_path = '/'.join([base_path, local_filename])

if local_filename in base_path:
    print(f'Skipping index file for {local_filename} because it is already saved.')

url = f'https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/company.20200102.idx'
r = requests.get(url, stream=True, headers={'user-agent': 'MyName myname#outlook.com'})
with open(local_file_path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=10240):
        f.write(chunk)
Next I would like to build a parser that is fault tolerant, because it has to parse a new idx file into a pd.DataFrame every day.
My idea was to use string manipulation, but it would be very complicated and not fault tolerant.
I would be thankful if someone could show the best practice for parsing this and provide some boilerplate code.
Since this is mostly a fixed-width file, you can use pandas read_fwf to read it. You can skip over the leading information (via skiprows=) and get straight to the data. The column names are predefined and assigned on read:
idx_path = 'company.20220112.idx'
names = ['Company Name','Form Type','CIK','Date Filed','File Name']
df = pd.read_fwf(idx_path, colspecs=[(0,61),(62,74),(74,84),(86,94),(98,146)], names=names, skiprows=11)
df.head(10)
Company Name Form Type CIK Date Filed File Name
0 005 - Series of IPOSharks Venture Master Fund,... D 1888451 20220112 edgar/data/1888451/0001888451-22-000002.txt
1 10X Capital Venture Acquisition Corp. III EFFECT 1848948 20220111 edgar/data/1848948/9999999995-22-000102.txt
2 110 White Partners LLC D 1903845 20220112 edgar/data/1903845/0001884293-22-000001.txt
3 15 Beach, MHC 3 1903509 20220112 edgar/data/1903509/0001567619-22-001073.txt
4 15 Beach, MHC SC 13D 1903509 20220112 edgar/data/1903509/0000943374-22-000014.txt
5 170 Valley LLC D 1903913 20220112 edgar/data/1903913/0001903913-22-000001.txt
6 1st FRANKLIN FINANCIAL CORP 424B3 38723 20220112 edgar/data/38723/0000038723-22-000003.txt
7 1st FRANKLIN FINANCIAL CORP 424B3 38723 20220112 edgar/data/38723/0000038723-22-000004.txt
8 215 BF Associates LLC D 1904145 20220112 edgar/data/1904145/0001904145-22-000001.txt
9 2401 Midpoint Drive REIT, LLC D 1903337 20220112 edgar/data/1903337/0001903337-22-000001.txt

How to read a file containing more than one record type within?

I have a .csv file that contains 3 types of records, each with different quantity of columns.
I know the structure of each record type and that the rows are always of type1 first, then type2 and type 3 at the end, but I don't know how many rows of each record type there are.
The first 4 characters of each row define the record type of that row.
CSV Example:
typ1,John,Smith,40,M,Single
typ1,Harry,Potter,22,M,Married
typ1,Eva,Adams,35,F,Single
typ2,2020,08,16,A
typ2,2020,09,02,A
typ3,Chevrolet,FC101TT,2017
typ3,Toyota,CE972SY,2004
How can I read it with pandas? It doesn't matter if I have to read one record type at a time.
Thanks!!
Here is a pandas solution.
First we must read the csv file in a way that pandas keeps each entire line in one cell. We do that by simply using a "wrong" separator, such as the hash symbol '#'. It can be whatever we want, as long as we can guarantee it never appears in the data file.
wrong_sep = '#'
right_sep = ','
df = pd.read_csv('my_file.csv', sep=wrong_sep).iloc[:, 0]
The .iloc[:, 0] is used as a quick way to convert a DataFrame into a Series.
Then we use a loop to select the rows that belong to each data structure based on their starting characters. Now we use the "right separator" (probably a comma ',') to split the desired data into real DataFrames.
starters = ['typ1', 'typ2', 'typ3']
detected_dfs = dict()
for start in starters:
_df = df[df.str.startswith(start)].str.split(right_sep, expand=True)
detected_dfs[start] = _df
And here you go. If we print the resulting DataFrames, we get:
0 1 2 3 4 5
0 typ1 Harry Potter 22 M Married
1 typ1 Eva Adams 35 F Single
0 1 2 3 4
2 typ2 2020 08 16 A
3 typ2 2020 09 02 A
0 1 2 3
4 typ3 Chevrolet FC101TT 2017
5 typ3 Toyota CE972SY 2004
Let me know if it helped you!
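One caveat worth noting: as written, read_csv treats the first line (the John Smith row) as the header, which is why it is missing from the typ1 output above. A small adjustment to the read, assuming the same file, keeps every row:

df = pd.read_csv('my_file.csv', sep=wrong_sep, header=None).iloc[:, 0]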
Not Pandas:
from collections import defaultdict
filename2 = 'Types.txt'
with open(filename2) as dataLines:
    nL = dataLines.read().splitlines()
defDList = defaultdict(list)
subs = ['typ1','typ2','typ3']
dataReadLines = [defDList[i].append(j) for i in subs for j in nL if i in j]
# dataReadLines = [i for i in nL]
print(defDList)
Output:
defaultdict(<class 'list'>, {'typ1': ['typ1,John,Smith,40,M,Single', 'typ1,Harry,Potter,22,M,Married', 'typ1,Eva,Adams,35,F,Single'], 'typ2': ['typ2,2020,08,16,A', 'typ2,2020,09,02,A'], 'typ3': ['typ3,Chevrolet,FC101TT,2017', 'typ3,Toyota,CE972SY,2004']})
You can make use of the skiprows parameter of pandas read_csv to skip the rows you are not interested in for a particular record type. The following gives you a dictionary dfs with a DataFrame for each type. An advantage is that records of the same type don't necessarily have to be adjacent to each other in the csv file.
For larger files you might want to adjust the code so that the file is only read once instead of twice; a single-pass sketch follows the code below.
import pandas as pd
from collections import defaultdict
indices = defaultdict(list)
types = ['typ1', 'typ2', 'typ3']
filename = 'test.csv'
with open(filename) as csv:
    for idx, line in enumerate(csv.readlines()):
        for typ in types:
            if line.startswith(typ):
                indices[typ].append(idx)

dfs = {typ: pd.read_csv(filename, header=None,
                        skiprows=lambda x: x not in indices[typ])
       for typ in types}
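A hedged sketch of the single-pass variant mentioned above: collect the raw lines per type while reading, then let pandas parse each group from memory (this reuses the types and filename defined above):

import pandas as pd
from collections import defaultdict
from io import StringIO

lines = defaultdict(list)
with open(filename) as csv_file:
    for line in csv_file:
        for typ in types:
            if line.startswith(typ):
                lines[typ].append(line)

dfs = {typ: pd.read_csv(StringIO(''.join(lines[typ])), header=None)
       for typ in types}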
Read the file as a CSV file using the CSV reader. The reader fortunately does not care about line formats:
import csv
with open("yourfile.csv") as infile:
data = list(csv.reader(infile))
Collect the rows with the same first element and build a dataframe of them:
import pandas as pd
from itertools import groupby
dfs = [pd.DataFrame(v) for _,v in groupby(data, lambda x: x[0])]
You've got a list of three dataframes (or as many as necessary).
dfs[1]
# 0 1 2 3 4
#0 typ2 2020 08 16 A
#1 typ2 2020 09 02 A
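Since itertools.groupby only groups consecutive rows, this relies on the record types being contiguous, which the question guarantees. A small hedged variant that keys the result by record type instead of by position:

dfs = {k: pd.DataFrame(list(v)) for k, v in groupby(data, lambda x: x[0])}
dfs['typ2']  # the typ2 rows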

How to efficiently concatenate columns from different csv files based on ids python

I have 4 csv files. Each file has different fields, e.g. name, id_number, etc. Each file describes the same objects, for which there is a unique id present in every file. So, I would like to concatenate the fields of the 4 files into a single DataFrame. For instance, one file contains first_name and another contains last_name; I want to merge those two so that I have the first and last name for each object.
Doing that is trivial, but I'd like to know the most efficient way, or if there is some built-in function that does it very efficiently.
The files look something like this:
file1:
id name age pets
b13 Marge 18 cat
y47 Dan 13 dog
h78 Mark 20 lizard
file2:
id last_name income city
y47 Schmidt 1800 Dallas
b13 Olson 1670 Paris
h78 Diaz 2010 London
Files 3 and 4 are similar, with different fields. The ids are not necessarily in the same order. The goal, again, is to have one DataFrame looking like this:
id name age pets last_name income city
b13 Marge 18 cat Olson 1670 Paris
y47 Dan 13 dog Schmidt 1800 Dallas
h78 Mark 20 lizard Diaz 2010 London
What I've done is this:
file1 = pd.read_csv('file1.csv')
file2 = pd.read_csv('file2.csv')
file3 = pd.read_csv('file3.csv')
file4 = pd.read_csv('file4.csv')
f1_group = file1.groupby(['id'])
f2_group = file2.groupby(['id'])
f3_group = file3.groupby(['id'])
f4_group = file4.groupby(['id'])
data = []
for id1, group1 in f1_group:
    for id2, group2 in f2_group:
        for id3, group3 in f3_group:
            for id4, group4 in f4_group:
                if id1 == id2 == id3 == id4:
                    frames = [group1, group2, group3, group4]
                    con = pd.concat(frames, axis=1)
                    data.append(con)
That works, but it is extremely inefficient. If I could eliminate the elements that have already been matched from group1, group2, etc., it would help, but it would still be inefficient.
Thanks in advance.
Hi maybe you can try this :)
https://www.freecodecamp.org/news/how-to-combine-multiple-csv-files-with-8-lines-of-code-265183e0854/
import os
import glob
import pandas as pd
#set working directory
os.chdir("/mydir")
#find all csv files in the folder
#use glob pattern matching -> extension = 'csv'
#save result in list -> all_filenames
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
#print(all_filenames)
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')
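Note that concatenating the files row-wise does not line the columns up by id as the question asks. A minimal sketch of an id-based join instead, assuming the four files share an id column:

import pandas as pd
from functools import reduce

files = ['file1.csv', 'file2.csv', 'file3.csv', 'file4.csv']
frames = [pd.read_csv(f) for f in files]

# successively merge every frame on the shared id column
merged = reduce(lambda left, right: pd.merge(left, right, on='id'), frames)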

Dividing single pandas dataframe into multiple csv files with predefined naming convention

I am facing an issue where I have to load a huge CSV file, split it into multiple DataFrames based on the unique values in certain columns, and output them as multiple CSVs with a predefined naming pattern.
The example of the original CSV is as below.
date place type product value zone
09/10/16 NY Zo shirt 19 1
09/10/16 NY Mo jeans 18 2
09/10/16 CA Zo trouser 13 3
09/10/16 CA Co tie 17 4
09/10/16 WA Wo bat 11 1
09/10/16 FL Zo ball 12 2
09/10/16 NC Mo belt 13 3
09/10/16 WA Zo buckle 15 4
09/10/16 WA Co glass 16 1
09/10/16 FL Zo cup 19 2
I have to filter this massive pandas DataFrame into multiple DataFrames based on place, type and zone, and the output DataFrames should be written to multiple CSV files with the naming convention place_type_product_zone.csv.
The code I have got till now is as below.
def list_of_dataframes(df, col_list):
    df_list = [df]
    name_list = []
    for _, i in enumerate(col_list):
        df_list, names = _split_dataframes(df_list, i)
    file_name = zip(name_list, df)
    _ = dict(zip(names, df))
    for k, v in _:
        v.to_csv("{0}.csv".format(k))
    print("CSV files created")
    return df, file_name

def _split_dataframes(df_list, col):
    names = []
    dfs = []
    for df in df_list:
        for c in df[col].unique():
            dfs.append(df.loc[df[col] == c])
            names.append(c)
    return dfs, names

list_of_dataframes(df, ['place', 'type', 'zone'])
It outputs csv files named 1.csv, 2.csv etc. How do I create a loop in the function to get the naming convention NY_Zo_shirt_1.csv, CA_Zo_trouser_3.csv etc.? Should I be creating a dictionary that stores all the keys?
Thanks in advance.
Here it is -
# Part 1
places = df['place'].unique()
types = df['type'].unique()
products = df['product'].unique()
zones = df['zone'].unique()
# Part 2
import itertools
combs = list(itertools.product(*[places, types, products, zones]))
#Part 3
for comb in combs:
    place, type_, prod, zone = comb
    df_subset = df[(df['place']==place) & (df['type']==type_) & (df['product']==prod) & (df['zone']==zone)]
    if df_subset.shape[0] > 0:
        df_subset.to_csv('temp1/{}_{}_{}_{}.csv'.format(place, type_, prod, zone), index=False)
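An alternative hedged sketch, assuming the same df: DataFrame.groupby visits only the combinations that actually occur, so the empty-subset check and the itertools.product step are not needed:

for (place, type_, prod, zone), group in df.groupby(['place', 'type', 'product', 'zone']):
    group.to_csv('temp1/{}_{}_{}_{}.csv'.format(place, type_, prod, zone), index=False)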

How to loop over multiple DataFrames and produce multiple csv?

Making the change from R to Python, I am having some difficulty writing multiple csv files with pandas from a list of DataFrames:
import pandas
from dplython import (DplyFrame, X, diamonds, select, sift, sample_n,
sample_frac, head, arrange, mutate, group_by, summarize,
DelayFunction)
diamonds = [diamonds, diamonds, diamonds]
path = "/user/me/"
def extractDiomands(path, diamonds):
    for each in diamonds:
        df = DplyFrame(each) >> select(X.carat, X.cut, X.price) >> head(5)
        df = pd.DataFrame(df)  # not sure if that is required
        df.to_csv(os.path.join('.csv', each))

extractDiomands(path, diamonds)
That, however, generates errors. I appreciate any suggestions!
Welcome to Python! First I'll load a couple libraries and download an example dataset.
import os
import pandas as pd
example_data = pd.read_csv("http://www.ats.ucla.edu/stat/data/binary.csv")
print(example_data.head(5))
first few rows of our example data:
admit gre gpa rank
0 0 380 3.61 3
1 1 660 3.67 3
2 1 800 4.00 1
3 1 640 3.19 4
4 0 520 2.93 4
Now here's what I think you want done:
# spawn a few datasets to loop through
df_1, df_2, df_3 = example_data.head(20), example_data.tail(20), example_data.head(10)
list_of_datasets = [df_1, df_2, df_3]
output_path = 'scratch'
# in Python you can loop through collections of items directly, it's pretty cool.
# with enumerate(), you get both the index and the item from the sequence at each step
for index, dataset in enumerate(list_of_datasets):
    # Filter to keep just a couple of columns
    keep_columns = ['gre', 'admit']
    dataset = dataset[keep_columns]
    # Export to CSV
    filepath = os.path.join(output_path, 'dataset_' + str(index) + '.csv')
    dataset.to_csv(filepath)
At the end, my folder 'scratch' has three new csv's called dataset_0.csv, dataset_1.csv, and dataset_2.csv
