How to use variable name within dataframe path - python

The only thing I added was the attempted concatenation of the file path.
I am getting an 'unexpected character after line continuation character' error and cannot figure out why.
import numpy as np
import pandas as pd
import getpass
user = getpass.getuser()
data = pd.read_excel(r'C:\Users\' + user + '\Desktop\bulk export.xlsx',
                     sheet_name=1,
                     header=0)
df = pd.DataFrame(data, columns=[1, 'Record Type'])
print(df)

You can try this:
import getpass
from pathlib import Path
import pandas as pd

user = getpass.getuser()
my_file = Path(f"C:\\Users\\{user}\\Desktop\\bulk export.xlsx")
data = pd.read_excel(my_file, sheet_name=1, header=0)
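As an aside, the original error comes from r'C:\Users\': even in a raw string, the trailing backslash stops the quote from closing the literal, so the string runs on to the next quote and the stray \D in \Desktop is then read as a broken line continuation. If the workbook sits on the current user's own desktop, a minimal alternative sketch (assuming Path.home() resolves to C:\Users\<user>, so the getpass lookup isn't needed at all):

import pandas as pd
from pathlib import Path

# The / operator builds the path without any backslash escaping.
xlsx_path = Path.home() / "Desktop" / "bulk export.xlsx"
data = pd.read_excel(xlsx_path, sheet_name=1, header=0)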

Related

How to run function on multiple dataframes of variable row sizes, then generate a new dataframe with just the function results

I have a folder full of CSVs of equal columns but variable rows. I want to convert each to a dataframe and run a simple function on them, and create one new dataframe with just the function values and the file names as the index.
So far I have:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    pd.concat(df2[df.index == 'total'])
    df.to_csv('file_path')
I'm sure there are several ways in which this is messed up, but any advice is appreciated.
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    dfs.append(df.loc[['total']])  # 'total' is a row label, so select it with .loc
df_total = pd.concat(dfs).reset_index(drop=True)
df_total.to_csv('file_path')
OK I figured it out:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
filename = pd.DataFrame(columns=['Filename'])
filename['Filename'] = pd.Series([file for file in files]).reset_index(drop=True)
dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    dfs.append(df)
dfs = pd.concat(dfs)
total = dfs[dfs.index == 'total'][['dfcolumn1', 'dfcolumn2', etc]]  # write column names exactly as they appear on the csv
total_named = filename.join(total.set_index(filename.index))
total_named.to_csv('file_path')
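For reference, a more compact sketch of the same idea that keeps one row of totals per file, with the file names as the index. It assumes every CSV shares the same numeric columns; 'totals.csv' is just a placeholder output name:

import glob
import os
import tkinter.filedialog as filedialog
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
# One Series of column sums per file, keyed by the file name.
totals = {
    os.path.basename(f): pd.read_csv(f, index_col=0).sum(numeric_only=True)
    for f in glob.glob(pattern)
}
# Dict of Series -> columns; transpose so each file becomes a row.
df_total = pd.DataFrame(totals).T
df_total.to_csv('totals.csv')  # placeholder output name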

Pandas generating an empty csv while trying to combine all csv's into one csv

I am writing a python script that will read all the csv files in the current location and merge them into a single csv file. Below is my code:
import os
import numpy as np
import pandas as pd
import glob
path = os.getcwd()
extension = 'csv'
os.chdir(path)
tables = glob.glob('*.{}'.format(extension))
data = pd.DataFrame()
for i in tables:
    try:
        df = pd.read_csv(r'' + path + '/' + i + '')
        # Here I want to create an index column with the name of the file and leave that column empty
        df[i] = np.NaN
        df.set_index(i, inplace=True)
        # Below line appends an empty row for easy differentiation
        df.loc[df.iloc[-1].name + 1, :] = np.NaN
        data = data.append(df)
    except Exception as e:
        print(e)
data.to_csv('final_output.csv', index=False, header=None)
If I remove the two lines below, then it works:
df[i] = np.NaN
df.set_index(i, inplace=True)
But I want the first column's name to be the name of the file, with its values left as NaN or empty, so the combined output shows which file each block came from.
I tend to avoid the DataFrame.append method in favor of pandas.concat (append was deprecated and has since been removed in pandas 2.0).
Try this:
import os
from pathlib import Path
import pandas as pd
files = Path(os.getcwd()).glob('*.csv')
df = pd.concat(
    [pd.read_csv(f).assign(filename=f.name) for f in files],
    ignore_index=True,
)
df.to_csv('alldata.csv', index=False)
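If the main requirement is that the first (index) column of the combined file carries the file name, here is a variant sketch of the same concat approach that builds that index directly via the dict keys instead of adding a column first ('alldata_by_file.csv' is a placeholder name):

import os
from pathlib import Path
import pandas as pd

files = Path(os.getcwd()).glob('*.csv')
df = pd.concat(
    {f.name: pd.read_csv(f) for f in files},  # dict keys become the outer index level
    names=['filename', 'row'],
)
df.to_csv('alldata_by_file.csv')  # the filename level becomes the first column of the CSV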

Error in reading csv files because of delimiter

I would need your help regarding a problem in reading files.
I have some csv files which use a semicolon (;) as the delimiter instead of a comma. In general, for those cases, I do as follows:
pd.read_csv('path/filename.csv', sep=';', engine='python')
and for those with no issues:
pd.read_csv('path/filename.csv')
Since I have a list of files, I do not know which one is causing the error, so I would like to edit the code below so that it handles both cases when an error occurs.
The current error is:
ParserError: Error tokenizing data. C error: Expected 3 fields in line 9, saw 9
The code that I need to edit to include the conditions above is the following:
import pandas as pd
from pathlib import Path
from os.path import join
import matplotlib.pyplot as plt
import glob
def create_dataset():
    country = 'UK'
    base_path = Path('' + country)
    glob_pattern = str(base_path.joinpath("*.csv"))
    all_csv_filenames = glob.glob(glob_pattern)
    dataframes = {
        (str(Path(filename).parent), str(Path(filename).stem)): pd.read_csv(
            filename, sep=','
        )
        for filename in all_csv_filenames
    }
    data = pd.concat(dataframes, names=['Country', 'FileName', '_'])
    return data
Thank you for your help
You could use try and except; sadly, AFAIK there is no way of doing that inside a comprehension, so use a regular for loop, something along these lines:
import pandas as pd
from pathlib import Path
from os.path import join
import matplotlib.pyplot as plt
import glob
def create_dataset():
    country = 'UK'
    base_path = Path('' + country)
    glob_pattern = str(base_path.joinpath("*.csv"))
    all_csv_filenames = glob.glob(glob_pattern)
    dataframes = {}
    for filename in all_csv_filenames:
        try:
            v = pd.read_csv(filename, sep=',')
        except pd.errors.ParserError:  # ParserError lives in pandas.errors
            v = pd.read_csv(filename, sep=';')
        dataframes[(str(Path(filename).parent), str(Path(filename).stem))] = v
    data = pd.concat(dataframes, names=['Country', 'FileName', '_'])
    return data
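As an alternative to catching the error at all, pandas can sniff the delimiter itself: passing sep=None together with engine='python' makes read_csv detect the separator via csv.Sniffer. A small sketch of that variant (slower than the C engine, but it handles both ',' and ';' files in one call):

import pandas as pd

def read_any_delimiter(filename):
    # sep=None with the python engine lets pandas sniff the delimiter
    # (comma, semicolon, tab, ...) from the start of the file.
    return pd.read_csv(filename, sep=None, engine='python')

Inside the loop above, v = read_any_delimiter(filename) would then replace the whole try/except block.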
If you change all instances of sep to delimiter, it should work:
import pandas as pd
from pathlib import Path
from os.path import join
import matplotlib.pyplot as plt
import glob
def create_dataset():
    country = 'UK'
    base_path = Path('' + country)
    glob_pattern = str(base_path.joinpath("*.csv"))
    all_csv_filenames = glob.glob(glob_pattern)
    dataframes = {
        (str(Path(filename).parent), str(Path(filename).stem)): pd.read_csv(
            filename, delimiter=','
        )
        for filename in all_csv_filenames
    }
    data = pd.concat(dataframes, names=['Country', 'FileName', '_'])
    return data

How to create a pandas dataframe from one file (with any file name) located in a specified folder?

What's the best way to create a pandas dataframe from one file with any file name located in a specified folder?
I have used pathlib and it's not quite working as the output dataframe is not giving me anything.
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
fle = Path(pth).glob('*.tsv')
someDf = pd.DataFrame(fle)
someDf
Edit:
I also tried doing the below, but the output dataframe combines all columns into one column separated by a backward slash. How do I fix this?
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
fle = Path(pth).glob('*.tsv')
dfs = []
for filename in fle:
    dfs.append(pd.read_csv(filename))
dfs1 = pd.concat(dfs)
dfs1.head()
The way I did this seems complicated. Is there an easier way to do this?
Please try:
from pathlib import Path
import pandas as pd
import os
pth = r'C:\Users\HP\Desktop\IBM\New folder'
for file_ in os.listdir(pth):
    h = os.path.join(pth, file_)
    # print(h)
    someDf = pd.read_csv(h, sep='\t')  # the files are tab-separated
someDf
Try
from glob import glob
import pandas as pd

files = glob(r'C:\Users\HP\Desktop\IBM\New folder\*.tsv')  # raw string so the backslashes are not treated as escapes
if len(files) == 1:
    dfs = pd.read_csv(files[0], sep='\t')
else:
    dfs = pd.concat([pd.read_csv(file, sep='\t') for file in files])
The solution I found is below. I had missed the sep parameter in pd.read_csv().
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
fle = Path(pth).glob('*.tsv')
dfs = []
for filename in fle:
    dfs.append(pd.read_csv(filename, sep='\t'))
dfs1 = pd.concat(dfs)
dfs1.head()
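Since the folder is expected to hold exactly one file whose name is unknown, a slightly shorter sketch just takes the first match from the glob generator (this raises StopIteration if no .tsv file is present):

from pathlib import Path
import pandas as pd

pth = r'C:\Users\HP\Desktop\IBM\New folder'
only_file = next(Path(pth).glob('*.tsv'))  # first (and only) matching file
someDf = pd.read_csv(only_file, sep='\t')
someDf.head()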

Changing Column Heading CSV File

I am currently trying to change the headings of the file I am creating. The code I am using is as follows:
import pandas as pd
import os, sys
import glob
path = "C:\\Users\\cam19\\Desktop\\Test1\\*.csv"
list_=[]
for fname in glob.glob(path):
    df = pd.read_csv(fname, dtype=None, low_memory=False)
    output = df['logid'].value_counts()
    list_.append(output)
df1 = pd.DataFrame()
df2 = pd.concat(list_, axis=1)
df2.to_csv('final.csv')
Basically I am looping through a file directory and extracting data from each file. Using this outputs the following image:
http://imgur.com/a/LE7OS
All I want to do is change the column name from 'logid' to the name of the file it is currently processing, but I am not sure how to do this. Any help is great! Thanks.
Instead of appending the raw value counts, wrap them in a DataFrame and set its column name to the file name, i.e.
output = pd.DataFrame(df['logid'].value_counts())
output.columns = [os.path.basename(fname).split('.')[0]]
list_.append(output)
With those changes, the code in the question becomes:
import pandas as pd
import os, sys
import glob

path = "C:\\Users\\cam19\\Desktop\\Test1\\*.csv"
list_ = []
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    output = pd.DataFrame(df['logid'].value_counts())
    output.columns = [os.path.basename(fname).split('.')[0]]
    list_.append(output)
df2 = pd.concat(list_, axis=1)
df2.to_csv('final.csv')
Hope it helps
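A slightly shorter variant of the same idea: value_counts() returns a Series, so renaming that Series after the file's stem gives concat the column name directly (same assumption as the question, i.e. each CSV has a 'logid' column):

import glob
import os
import pandas as pd

path = "C:\\Users\\cam19\\Desktop\\Test1\\*.csv"
counts = [
    pd.read_csv(fname, low_memory=False)['logid']
    .value_counts()
    .rename(os.path.splitext(os.path.basename(fname))[0])  # Series name -> column name
    for fname in glob.glob(path)
]
pd.concat(counts, axis=1).to_csv('final.csv')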
