Convert list of multiple strings into a Python data frame

I have a list of string values that I read from a text document with splitlines(), which yields something like this:
X = ["NAME|Contact|Education","SMITH|12345|Graduate","NITA|11111|Diploma"]
I have tried this:
for i in X:
    textnew = i.split("|")
    data[x] = textnew
I want to make a dataframe out of this:
Name   Contact  Education
SMITH  12345    Graduate
NITA   11111    Diploma

You can read it directly from your file by specifying a sep argument to pd.read_csv.
df = pd.read_csv("/path/to/file", sep='|')
Or, if you wish to convert it from the list of strings instead:
data = [row.split('|') for row in X]
headers = data.pop(0) # Pop the first element since it's header
df = pd.DataFrame(data, columns=headers)

You had it almost correct, actually, but don't use data as a dictionary (by assigning to keys: data[x] = textnew):
X = ["NAME|Contact|Education","SMITH|12345|Graduate","NITA|11111|Diploma"]
df = []
for i in X:
df.append(i.split("|"))
print(df)
# [['NAME', 'Contact', 'Education'], ['SMITH', '12345', 'Graduate'], ['NITA', '11111', 'Diploma']]
Depending on your further transformations, pandas might be overkill for this kind of task.
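If plain Python turns out to be enough, a minimal sketch of that idea; the list of dicts below is just one possible target shape:
X = ["NAME|Contact|Education", "SMITH|12345|Graduate", "NITA|11111|Diploma"]
rows = [line.split("|") for line in X]
header, *records = rows  # first row is the header
as_dicts = [dict(zip(header, record)) for record in records]
print(as_dicts)
# [{'NAME': 'SMITH', 'Contact': '12345', 'Education': 'Graduate'},
#  {'NAME': 'NITA', 'Contact': '11111', 'Education': 'Diploma'}]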

Here is a solution to your problem:
import pandas as pd

X = ["NAME|Contact|Education", "SMITH|12345|Graduate", "NITA|11111|Diploma"]
data = []
for i in X:
    data.append(i.split("|"))
headers = data.pop(0)  # remove the header row before building the frame
df = pd.DataFrame(data, columns=headers)

In your situation, you can avoid loading the file with readlines and let pandas take care of reading the file.
As mentioned above, the solution is a standard read_csv:
import os
import pandas as pd

path = "/tmp"
filepath = "file.xls"
filename = os.path.join(path, filepath)
df = pd.read_csv(filename, sep='|')
print(df.head())
Another approach (for situations where you have no access to the file, or have to deal with a list of strings) is to wrap the list of strings as an in-memory text file, then load it normally using pandas:
import pandas as pd
from io import StringIO

X = ["NAME|Contact|Education", "SMITH|12345|Graduate", "NITA|11111|Diploma"]
# Join the list with newlines and wrap it as an in-memory text file
DATA = StringIO("\n".join(X))
# Load it as a pandas dataframe
df = pd.read_csv(DATA, delimiter="|")
Here is the result:
    NAME  Contact Education
0  SMITH    12345  Graduate
1   NITA    11111   Diploma

Related

How to create a function to import CSV?

I'd like to create a function in Python to import CSV files from GitHub by just inputting the file name.
I tried the following code, but no dataframe is created. Can anyone give me a hand? Million thanks!
import pandas as pd

def in_csv(file_name):
    file = 'https://raw.githubusercontent.com/USER/file_name.csv'
    file_name = pd.read_csv(file, header=0)

in_csv('csv_name')
There are many ways to read a CSV, but here is one with a pandas.DataFrame:
import pandas as pd

def in_csv(file_name):
    file_path = f'https://raw.githubusercontent.com/USER/{file_name}'
    df = pd.read_csv(file_path, header=0)
    return df

df = in_csv('csv_name')
print(df.head())
Thanks @Alian and @Aaron.
Just add '.csv' after {file_name} and it works perfectly.
import pandas as pd

def in_csv(file_name):
    file_path = f'https://raw.githubusercontent.com/USER/{file_name}.csv'
    df = pd.read_csv(file_path, header=0)
    return df

df = in_csv('csv_name')
In order to do this, you probably want to use a Python 3 F-string rather than a regular string. In this case, you would want to change your first line in the function to this:
file = f'https://raw.githubusercontent.com/USER/{file_name}.csv'
The f'{}' syntax uses the value of the variable or expression within the braces instead of the string literal you included.
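For example, with a hypothetical file name, just to show the substitution:
file_name = 'csv_name'
print(f'https://raw.githubusercontent.com/USER/{file_name}.csv')
# https://raw.githubusercontent.com/USER/csv_name.csv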

Create nested JSON lines from a pipe-delimited flat file using Python

I have a pipe-delimited text file as below. In that file, the same ID, CODE and NUM combination can have different INC and INC_DESC values:
ID|CODE|NUM|INC|INC_DESC
"F1"|"W1"|1|1001|"INC1001"
"F1"|"W1"|1|1002|"INC1002"
"F1"|"W1"|1|1003|"INC1003"
"F2"|"W1"|1|1002|"INC1003"
"F2"|"W1"|1|1003|"INC1004"
"F2"|"W2"|1|1003|"INC1003"
We want to create JSON like below, where the different INC and INC_DESC values come as an array for the same combination of ID, CODE and NUM:
{"ID":"F1","CODE":"W1","NUM":1,"INC_DTL":[{"INC":1001, "INC_DESC":"INC1001"},{"INC":1002, "INC_DESC":"INC1002"},{"INC":1003, "INC_DESC":"INC1003"}]}
{"ID":"F2","CODE":"W1","NUM":1,"INC_DTL":[{"INC":1002, "INC_DESC":"INC1002"},{"INC":1003, "INC_DESC":"INC1003"}]}
{"ID":"F2","CODE":"W2","NUM":1,"INC_DTL":[{"INC":1003, "INC_DESC":"INC1003"}]}
I tried the below, but it is not generating the nested structure I want:
import pandas as pd
Input_File=f'V:\input.dat'
df=pd.read_csv(Input_File, sep='|')
json_output=f'V:\outfile.json'
output=df.to_json(json_output, orient='records')
import pandas as pd

# agg function
def agg_that(x):
    l = [x]
    return l

Input_File = f'V:\input.dat'
df = pd.read_csv(Input_File, sep='|')
# groupby columns
df = df.groupby(['ID', 'CODE', 'NUM']).agg(agg_that).reset_index()
# create new column
df['INC_DTL'] = df.apply(
    lambda x: [{'INC': inc, 'INC_DESC': dsc} for inc, dsc in zip(x['INC'][0], x['INC_DESC'][0])],
    axis=1)
# drop old columns
df.drop(['INC', 'INC_DESC'], axis=1, inplace=True)
json_output = f'V:\outfile.json'
output = df.to_json(json_output, orient='records', lines=True)
OUTPUT:
{"ID":"F1","CODE":"W1","NUM":1,"INC_DTL":[{"INC":1001,"INC_DESC":"INC1001"},{"INC":1002,"INC_DESC":"INC1002"},{"INC":1003,"INC_DESC":"INC1003"}]}
{"ID":"F1","CODE":"W2","NUM":1,"INC_DTL":[{"INC":1003,"INC_DESC":"INC1003"}]}
{"ID":"F2","CODE":"W1","NUM":1,"INC_DTL":[{"INC":1002,"INC_DESC":"INC1003"},{"INC":1003,"INC_DESC":"INC1004"}]}

Create dataframe from a string Python

How do I create a dataframe from a string that looks like this (part of the string):
,file_05,,\r\nx data,y data\r\n-970.0,-34.12164,\r\n-959.0,-32.37526,\r\n-949.0,-30.360199,\r\n-938.0,-28.74816,\r\n-929.0,-27.53912,\r\n-920.0,-25.92707,\r\n-911.0,-24.31503,\r\n-900.0,-23.64334,\r\n-891.0,-22.29997,
I'm trying to make a dataframe with a file_05 header above x data and y data columns.
In the code below s is the string:
import pandas as pd
from io import StringIO
df = pd.read_csv(StringIO(s)).dropna(axis=1)
df.rename(columns={df.columns[0]: ""}, inplace=True)
By the way, if the string comes from a csv file then it is simpler to read the file directly using pd.read_csv.
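For instance, a minimal sketch of that, where data.csv is a hypothetical file holding the same text:
import pandas as pd

# hypothetical file containing the same comma-separated text
df = pd.read_csv("data.csv").dropna(axis=1)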
Edit: This code will create a multiindex of columns:
import pandas as pd
from io import StringIO

df = pd.read_csv(StringIO(s), header=None).dropna(how="all", axis=1).T
df[0] = df.loc[1, 0]         # use 'file_05' as the top level for both columns
df = df.set_index([0, 1]).T  # turn the first two rows into a column MultiIndex
Looks like you want a multi-level dataframe from the string. Here's how I would do it.
Step 1: Split the string by '\r\n'. Then split each value by ','.
Step 2: The above step creates a list of lists. Element #0 has 4 items and element #1 has 2 items. The rest have 3 items each and are the actual data.
Step 3: Convert the data into a dictionary from element #2 onwards, using the values in element #1 as keys (namely x data and y data). To ensure you get key:[list of values], use dict.setdefault(key, []).append(value).
Step 4: Create a normal dataframe from the dictionary, since all the values are stored as keys and values in it.
Step 5: Now that you have the dataframe, create the MultiIndex and convert the columns to it.
Putting all this together, the code is:
import pandas as pd

text = ',file_05,,\r\nx data,y data\r\n-970.0,-34.12164,\r\n-959.0,-32.37526,\r\n-949.0,-30.360199,\r\n-938.0,-28.74816,\r\n-929.0,-27.53912,\r\n-920.0,-25.92707,\r\n-911.0,-24.31503,\r\n-900.0,-23.64334,\r\n-891.0,-22.29997,'
line_text = [txt.split(',') for txt in text.split('\r\n')]
dct = {}
for x, y, z in line_text[2:]:
    dct.setdefault(line_text[1][0], []).append(x)
    dct.setdefault(line_text[1][1], []).append(y)
df = pd.DataFrame(dct)
df.columns = pd.MultiIndex.from_tuples([(line_text[0][i], line_text[1][i]) for i in [0, 1]])
print(df)
Output of this will be:
             file_05
   x data     y data
0  -970.0  -34.12164
1  -959.0  -32.37526
2  -949.0 -30.360199
3  -938.0  -28.74816
4  -929.0  -27.53912
5  -920.0  -25.92707
6  -911.0  -24.31503
7  -900.0  -23.64334
8  -891.0  -22.29997
You can convert your raw data to a table with Python, then save it to a CSV file using the csv package, or load it straight into a DataFrame.
from pandas import DataFrame

# s is the raw data
s = ",file_05,,\r\nx data,y data\r\n-970.0,-34.12164,\r\n-959.0,-32.37526,\r\n-949.0,-30.360199,\r\n-938.0,-28.74816,\r\n-929.0,-27.53912,\r\n-920.0,-25.92707,\r\n-911.0,-24.31503,\r\n-900.0,-23.64334,\r\n-891.0,-22.29997,"
# convert the raw data to a table, keeping only the first two fields of each row
table = [i.split(',') for i in s.split("\r\n")]
table = [i[:2] for i in table]
# table now looks like:
"""
[['', 'file_05'],
 ['x data', 'y data'],
 ['-970.0', '-34.12164'],
 ['-959.0', '-32.37526'],
 ['-949.0', '-30.360199'],
 ...
 ['-891.0', '-22.29997']]
"""
# save to an output.csv file
import csv
with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(table)

# save to a DataFrame df
df = DataFrame(table[2:], columns=table[1][:2])
print(df)

How to read multiple json files into pandas dataframe?

I'm having a hard time loading multiple line delimited JSON files into a single pandas dataframe. This is the code I'm using:
import os, json
import pandas as pd
import numpy as np
import glob

pd.set_option('display.max_columns', None)

temp = pd.DataFrame()
path_to_json = '/Users/XXX/Desktop/Facebook Data/*'
json_pattern = os.path.join(path_to_json, '*.json')
file_list = glob.glob(json_pattern)
for file in file_list:
    data = pd.read_json(file, lines=True)
    temp.append(data, ignore_index=True)
It looks like all the files are loading when I look through file_list, but I cannot figure out how to get each file into a dataframe. There are about 50 files with a couple of lines in each.
Change the last line to:
temp = temp.append(data, ignore_index = True)
The reason you have to do this is that append does not happen in place: it does not modify the original data frame, it just returns a new data frame with the result of the append operation.
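A quick demonstration of that, on pandas versions that still provide DataFrame.append (it was deprecated in 1.4 and removed in 2.0):
import pandas as pd

df = pd.DataFrame({'a': [1]})
out = df.append(pd.DataFrame({'a': [2]}), ignore_index=True)
print(len(df), len(out))  # 1 2 -> df is unchanged; append returned a new frame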
Edit:
Since writing this answer I have learned that you should never use DataFrame.append inside a loop because it leads to quadratic copying (see this answer).
What you should do instead is first create a list of data frames and then use pd.concat to concatenate them all in a single operation. Like this:
dfs = []  # an empty list to store the data frames
for file in file_list:
    data = pd.read_json(file, lines=True)  # read data frame from json file
    dfs.append(data)  # append the data frame to the list
temp = pd.concat(dfs, ignore_index=True)  # concatenate all the data frames in the list
This alternative should be considerably faster.
If you need to flatten the JSON, Juan Estevez's approach won't work as is. Here is an alternative:
import json
import pandas as pd

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
    dfs.append(json_data)
df = pd.concat(dfs, sort=False)  # or sort=True depending on your needs
Or, if your JSON files are line-delimited (not tested):
import json
import pandas as pd

dfs = []
for file in file_list:
    with open(file) as f:
        for line in f.readlines():
            json_data = pd.json_normalize(json.loads(line))
            dfs.append(json_data)
df = pd.concat(dfs, sort=False)  # or sort=True depending on your needs
from pathlib import Path
import pandas as pd

# read each file as a Series and stack them into one DataFrame (one row per file)
paths = Path("/home/data").glob("*.json")
df = pd.DataFrame([pd.read_json(p, typ="series") for p in paths])
I combined Juan Estevez's answer with glob. Thanks a lot.
import pandas as pd
import glob

def readFiles(path):
    files = glob.glob(path)
    dfs = []  # an empty list to store the data frames
    for file in files:
        data = pd.read_json(file, lines=True)  # read data frame from json file
        dfs.append(data)  # append the data frame to the list
    df = pd.concat(dfs, ignore_index=True)  # concatenate all the data frames in the list
    return df
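Usage, reusing the glob pattern from the question above (the path is whatever matches your files):
df = readFiles('/Users/XXX/Desktop/Facebook Data/*/*.json')
print(df.head())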
Maybe you should state whether the JSON files were created with pandas' pd.to_json() or in another way.
I used data that was not created with pd.to_json(), and I think it is not possible to use pd.read_json() in my case. Instead, I programmed a customized for-each loop approach to write everything into the DataFrames.
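A hedged sketch of what such a customized loop could look like; the field names here are purely hypothetical, since they depend on the structure of your JSON:
import json
import pandas as pd

records = []
for file in file_list:  # file_list built with glob, as above
    with open(file) as f:
        obj = json.load(f)
    # pick out whatever fields your JSON actually has (hypothetical names)
    records.append({'name': obj.get('name'), 'value': obj.get('value')})
df = pd.DataFrame(records)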

adding columns to dataframe based on file name in python

I have several txt files with different file names, and I would like to do a few things:
1.) Load all the data at once
2.) Use parts of the file name and add them to the dedicated dataframe as additional columns
3.) Add the files together
Below is a really, really manual example, but I want to automate it somehow. How is that possible?
The code looks like the following
import pandas as pd
#load data files
data1 = pd.read_csv('C:/file1_USA_Car_1d.txt')
data2 = pd.read_csv('C:/file2_USA_Car_2d.txt')
data3 = pd.read_csv('C:/file3_USA_Car_1m.txt')
data4 = pd.read_csv('C:/file3_USA_Car_6m.txt')
data5 = pd.read_csv('C:file3_USA_Car_1Y.txt')
df = pd.DataFrame()
print(df)
df = data1
#--> The input for the column below should be taken from the name of the file
df['country'] = 'USA'
df['Type'] = 'Car'
df['duration'] = '1d'
print(df)
Iterate over your files with glob and do some simple splitting on the filenames.
import glob
import pandas as pd

df_list = []
for file in glob.glob('C:/file1_*_*_*.txt'):
    # Tweak this to work for your actual filepaths, if needed.
    country, typ, dur = file.split('.')[0].split('_')[1:]
    df = (pd.read_csv(file)
          .assign(Country=country, Type=typ, duration=dur))
    df_list.append(df)
df = pd.concat(df_list)
One way of doing it would be to do this:
all_res = pd.DataFrame()
file_list = ['C:/file1_USA_Car_1d.txt', 'C:/file3_USA_Car_1m.txt', 'etc']
for file_name in file_list:
    tmp = pd.read_csv(file_name)
    tmp['file_name'] = file_name
    all_res = all_res.append(tmp)
all_res = all_res.reset_index()
I'd do something like the following:
from pathlib import Path
from operator import itemgetter
import pandas as pd
file_paths = [
    Path(path_str)
    for path_str in (
        'C:/file1_USA_Car_1d.txt', 'C:/file2_USA_Car_2d.txt',
        'C:/file3_USA_Car_1m.txt', 'C:/file3_USA_Car_6m.txt',
        'C:file3_USA_Car_1Y.txt')
]

def import_csv(csv_path):
    df = pd.read_csv(csv_path)
    df['country'], df['Type'], df['duration'] = itemgetter(1, 2, 3)(csv_path.stem.split('_'))
    return df

dfs = [import_csv(csv_path) for csv_path in file_paths]
This helps encapsulate your desired behavior in a helper function and reduces the things you need to think about.
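From there you would presumably combine them into a single frame, e.g.:
import pandas as pd

# dfs as built above
df = pd.concat(dfs, ignore_index=True)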
