I would need your help regarding a problem in reading files.
I have some csv files which use a different delimiter (;) instead of ,. In general, for those cases, I do as follows:
pd.read_csv('path/filename.csv', sep=';', engine='python')
and for those with no issues:
pd.read_csv('path/filename.csv')
Since I have a list of files, I do not know which one is causing the error, so I would need to edit the code below a bit, in order to handle both cases when an error occurs.
The current error is:
ParserError: Error tokenizing data. C error: Expected 3 fields in line 9, saw 9
The code that I need to edit to include the conditions above is the following:
import pandas as pd
from pathlib import Path
from os.path import join
import matplotlib.pyplot as plt
import glob
def create_dataset():
    """Read every CSV file under ./UK and concatenate them into one frame.

    NOTE(review): every file is read with sep=',' — a file that is actually
    semicolon-delimited raises the ParserError quoted above.
    """
    country='UK'
    base_path = Path(''+country)
    # Pattern like 'UK/*.csv' for glob.
    glob_pattern = str(base_path.joinpath("*.csv"))
    all_csv_filenames = glob.glob(glob_pattern)
    # Keyed by (parent dir, file stem) so concat can build a MultiIndex.
    dataframes = {
        (str(Path(filename).parent), str(Path(filename).stem)): pd.read_csv(
            filename, sep=','
        )
        for filename in all_csv_filenames
    }
    # Index levels: Country (parent dir), FileName (stem), _ (row number).
    data = pd.concat(dataframes, names=['Country', 'FileName', '_'],)
    return data
Thank you for your help
You could use try and except; sadly, AFAIK, there is no way of doing that in a comprehension, so use a regular for loop, something along these lines:
import pandas as pd
from pathlib import Path
from os.path import join
import matplotlib.pyplot as plt
import glob
def create_dataset():
    """Read every CSV file under ./UK into one concatenated DataFrame.

    Each file is first read with sep=','; if that raises a ParserError
    (the file is semicolon-delimited), it is retried with sep=';'.

    Returns:
        DataFrame with a MultiIndex of (Country, FileName, _).
    """
    country = 'UK'
    base_path = Path(country)
    glob_pattern = str(base_path.joinpath("*.csv"))
    all_csv_filenames = glob.glob(glob_pattern)
    dataframes = {}
    for filename in all_csv_filenames:
        try:
            v = pd.read_csv(filename, sep=',')
        except pd.errors.ParserError:
            # Bare `ParserError` was an undefined name (NameError when the
            # exception fired); the class lives in pandas.errors.
            v = pd.read_csv(filename, sep=';')
        dataframes[(str(Path(filename).parent), str(Path(filename).stem))] = v
    return pd.concat(dataframes, names=['Country', 'FileName', '_'])
If you change all instances of sep to delimiter, it should work:
import pandas as pd
from pathlib import Path
from os.path import join
import matplotlib.pyplot as plt
import glob
def create_dataset():
    """Load every CSV under ./UK and stack them into a single DataFrame.

    NOTE: `delimiter` is an alias for `sep` in pandas.read_csv, so this
    reads comma-separated files exactly as sep=',' would.
    """
    country = 'UK'
    base_path = Path('' + country)
    all_csv_filenames = glob.glob(str(base_path.joinpath("*.csv")))
    # One entry per file, keyed by (parent directory, file stem).
    dataframes = {}
    for csv_path in all_csv_filenames:
        key = (str(Path(csv_path).parent), str(Path(csv_path).stem))
        dataframes[key] = pd.read_csv(csv_path, delimiter=',')
    # The dict keys become the Country/FileName levels of the MultiIndex.
    return pd.concat(dataframes, names=['Country', 'FileName', '_'])
Related
The only thing I added was the attempted concatenation of the file path.
I am getting an 'unexpected character after line continuation character' error, and cannot figure out why.
import numpy as np
import pandas as pd
import getpass
# Current OS user name, spliced into the Excel path below.
user = getpass.getuser()
# NOTE(review): r'C:\Users\' is the reported bug — a raw string cannot end
# with a single backslash (the \' keeps the quote inside the literal), so
# the string never terminates and Python reports "unexpected character
# after line continuation character". The later pieces also contain '\b'
# (a backspace escape in a non-raw string). Use doubled backslashes or
# forward slashes throughout.
data = pd.read_excel (r'C:\Users\' + user + '\Desktop\bulk export.xlsx',
                      sheet_name=1,
                      header=0)
# Keep only column 1 and 'Record Type' from the loaded sheet.
df = pd.DataFrame(data,
                  columns= [1,'Record Type'])
print (df)
You can try this:
import pathlib
from pathlib import Path
# Resolve the workbook path for the currently logged-in user; doubled
# backslashes inside the f-string avoid the raw-string-trailing-backslash
# trap from the question.
current_user = getpass.getuser()
my_file = Path(f"C:\\Users\\{current_user}\\Desktop\\bulk export.xlsx")
# Second sheet (sheet_name=1), first row used as the header.
data = pd.read_excel(my_file, sheet_name=1, header=0)
I have a collection of Excel files containing similar datasets. I want them to be read into different Pandas dataframes.
import glob
import pandas as pd
# Folder that is supposed to hold the Excel workbooks.
path=r"C:users/me/desktop/ExcelData"
# NOTE(review): glob is given only '*.xls', so it searches the current
# working directory — `path` is never used here.
files=glob.glob('*.xls')
for f in files:
    # NOTE(review): `df` is never initialised, so this raises NameError on
    # the first iteration; presumably a dict was intended (df = {}).
    df[f]=pd.read_excel(f)
import glob
import pandas as pd
import os
# Folder containing the workbooks to load.
path = r"C:\\users\\me\\desktop\\ExcelData\\"
# Every .xls file in that folder.
csv_files = glob.glob(os.path.join(path, "*.xls"))
# One DataFrame per workbook, collected in a list.
dfl = [pd.read_excel(workbook) for workbook in csv_files]
import pandas as pd
import os
import glob
# Folder containing the workbooks (mixed slashes kept from the original).
path = r"C:users/me/desktop/ExcelData"
files = glob.glob(path + r"\*.xls")
# DataFrame.append was removed in pandas 2.0 — accumulate the per-file
# frames in a list and concatenate once (also avoids quadratic copying).
frames = []
for file in files:
    # sheet_name=None loads every sheet as a dict of DataFrames;
    # this concat flattens them into one frame per workbook.
    df = pd.concat(pd.read_excel(file, sheet_name=None), ignore_index=True,
                   sort=False)
    frames.append(df)
# Guard the empty case: pd.concat([]) raises ValueError.
finalexcelsheet = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(finalexcelsheet)
What's the best way to create a pandas dataframe from one file with any file name located in a specified folder?
I have used pathlib and it's not quite working as the output dataframe is not giving me anything.
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
# Lazily yields a Path object for every .tsv file in the folder.
fle = Path(pth).glob('*.tsv')
# NOTE(review): this builds a DataFrame *of Path objects* (one row per
# file); it never reads the file contents — hence the unhelpful output.
someDf = pd.DataFrame(fle)
someDf
Edit:
I also tried doing the below, but the output dataframe combines all columns into one column separated by a backward slash. How do I fix this?
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
fle = Path(pth).glob('*.tsv')
dfs = []
for filename in fle:
    # NOTE(review): read_csv defaults to sep=',' — for a .tsv file the
    # whole line lands in a single column; pass sep='\t'.
    dfs.append(pd.read_csv(filename))
dfs1 = pd.concat(dfs)
dfs1.head()
The way I did this seems complicated. Is there an easier way to do this?
Please try:
from pathlib import Path
import pandas as pd
import os
pth = r'C:\Users\HP\Desktop\IBM\New folder'
# The original overwrote someDf on every iteration, keeping only the last
# file read; accumulate the frames and concatenate once instead.
frames = []
for file_ in os.listdir(pth):
    h = os.path.join(pth, file_)
    # The folder holds .tsv files — without sep='\t' every row collapses
    # into a single comma-less column.
    frames.append(pd.read_csv(h, sep='\t'))
someDf = pd.concat(frames) if frames else pd.DataFrame()
someDf
Try
from glob import glob
# Raw string required: '\U' inside a normal literal starts a \Uxxxxxxxx
# unicode escape and is a SyntaxError in Python 3.
files = glob(r'C:\Users\HP\Desktop\IBM\New folder\*.tsv')
if len(files) == 1:
    # Single file: read it directly.
    dfs = pd.read_csv(files[0], sep='\t')
elif files:
    # Several files: read each and stack them.
    dfs = pd.concat([pd.read_csv(file, sep='\t') for file in files])
else:
    # No matching files: return an empty frame instead of letting
    # pd.concat([]) raise ValueError.
    dfs = pd.DataFrame()
The solution I found for this is as below. I missed the sep parameter in pd.read_csv().
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
# Read every .tsv (tab-separated) in the folder and stack the frames.
dfs = [pd.read_csv(tsv_path, sep='\t') for tsv_path in Path(pth).glob('*.tsv')]
dfs1 = pd.concat(dfs)
dfs1.head()
I am trying to iterate through json files in a folder and append them all into one pandas dataframe.
If I say
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
directory_in_str = 'building_data'
# Bytes form of the path; note listdir on a bytes path yields bytes names.
directory = os.fsencode(directory_in_str)
df_all = pd.DataFrame()
with open("building_data/rooms.json") as file:
    data = json.load(file)
# Flatten the list under the 'rooms' key into a tabular frame.
df = json_normalize(data['rooms'])
# NOTE(review): `df_y` is not defined anywhere above, and DataFrame.append
# returns a new frame rather than mutating in place — the result here is
# discarded.
df_y.append(df, ignore_index=True)
I get a dataframe with the data from the one file. If I turn this thinking into a for loop, I have tried
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
directory_in_str = 'building_data'
directory = os.fsencode(directory_in_str)
df_all = pd.DataFrame()
for file in os.listdir(directory):
    # NOTE(review): the loop variable is `file`, but `filename` is used
    # here — NameError (or a stale value from an earlier run). Also,
    # os.listdir on a *bytes* path yields bytes entries.
    with open(directory_in_str+'/'+filename) as file:
        data = json.load(file)
    df = json_normalize(data['rooms'])
    # NOTE(review): DataFrame.append returns a new frame; the result is
    # discarded, which is why df_all stays empty.
    df_all.append(df, ignore_index=True)
print(df_all)
This returns an empty dataframe. Does anyone know why this is happening? If I print df before appending it, it prints the correct values, so I am not sure why it is not appending.
Thank you!
Instead of append next DataFrame I would try to join them like that:
# First frame: adopt it as the accumulator; later frames are joined on.
if df_all.empty:
    df_all = df
else:
    # join() aligns on the index; unlike the append calls above, the
    # returned frame is reassigned so the result is actually kept.
    df_all = df_all.join(df)
When joining DataFrames, you can specify on what they should be joined - on index or on specific (key) column, as well as how (default option is similar to appending - 'left').
Here's docs about pandas.DataFrame.join.
In these instances I load everything from json into a list by appending each file's returned dict onto that list. Then I pass the list to pandas.DataFrame.from_records (docs)
In this case the source would become something like...
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
directory_in_str = 'building_data'
directory = os.fsencode(directory_in_str)
# One normalized DataFrame per file; concatenated once at the end.
frames = []
for entry in os.listdir(directory):
    # listdir on a bytes path yields bytes names — decode before joining,
    # and use the loop variable (the original referenced an undefined
    # `filename`).
    fname = os.fsdecode(entry)
    with open(os.path.join(directory_in_str, fname)) as file:
        data = json.load(file)
    frames.append(json_normalize(data['rooms']))
# The original passed a list of DataFrames to pandas.DataFrame.from_records
# (and `pandas` was not a defined name — it was imported as `pd`);
# pd.concat is the correct way to combine per-file frames.
df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(df_all)
I am currently trying to change the headings of the file I am creating. The code I am using is as follows;
import pandas as pd
import os, sys
import glob
# Every CSV in the test folder.
path = "C:\\Users\\cam19\\Desktop\\Test1\\*.csv"
list_=[]
for fname in glob.glob(path):
    df = pd.read_csv(fname, dtype=None, low_memory=False)
    # Frequency count of each logid value; this Series keeps the generic
    # 'logid' name, which is what shows up as every column header.
    output = (df['logid'].value_counts())
    list_.append(output)
# NOTE(review): df1 is never used afterwards.
df1 = pd.DataFrame()
# One column per input file, rows aligned on the logid values.
df2 = pd.concat(list_, axis=1)
df2.to_csv('final.csv')
Basically I am looping through a file directory and extracting data from each file. Using this is outputs the following image;
http://imgur.com/a/LE7OS
All I want to do is change the column name from 'logid' to the name of the file currently being processed, but I am not sure how to do this. Any help is great! Thanks.
Instead of appending the values try to append values by creating the dataframe and setting the column i.e
# Wrap the counts in a DataFrame so the column can be renamed...
output = pd.DataFrame(df['value'].value_counts())
# ...to the source file's base name (filename without its extension).
output.columns = [os.path.basename(fname).split('.')[0]]
list_.append(output)
Changes in the code in the question
import pandas as pd
import os, sys
import glob
path = "C:\\Users\\cam19\\Desktop\\Test1\\*.csv"
list_ = []
# `files` was undefined in the original — iterate the glob of `path`,
# matching the question's code.
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    # Counts of the question's 'logid' column (the original inconsistently
    # used 'value' here), renamed to the source file's base name.
    output = pd.DataFrame(df['logid'].value_counts())
    output.columns = [os.path.basename(fname).split('.')[0]]
    list_.append(output)
# Guard the empty case: pd.concat([]) raises ValueError.
df2 = pd.concat(list_, axis=1) if list_ else pd.DataFrame()
df2.to_csv('final.csv')
Hope it helps