Addition of a column into a dataframe - python

import pandas as pd
import numpy as np
import feather
import glob
path = r'C:/Users/user1/Desktop/Test' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for i,filename in enumerate (all_files):
df = pd.read_csv(filename, ',' ,index_col=None, header=0).assign(user_iD=filename)
li.append(df)
data = pd.concat(li, axis=0, ignore_index=True)
df = data.copy()
df.to_feather('KT2test.ftr')
data1= pd.read_feather('KT2test.ftr')
data1.tail(50)
The value I'm getting in the user_iD column is the full path, e.g. C:/Users/user1/Desktop/Test\u9.csv,
although I only want the user ID itself: u9, or just 9.
How can I get this done?

import os

# Label every row with the file's stem ("u9" for ".../Test\u9.csv") instead of
# the full path. os.path.basename copes with whichever separator glob returned,
# and splitext strips only the final extension, so dotted file names survive.
# `sep` is passed by keyword because newer pandas releases no longer accept it
# as a positional argument.
user_id = os.path.splitext(os.path.basename(filename))[0]
df = pd.read_csv(filename, sep=',', index_col=None, header=0).assign(user_iD=user_id)

import os

# Keep only what follows the leading "u" ("9" for ".../Test\u9.csv"). The stem
# is taken with os.path rather than by splitting the raw path on "\\u", which
# only matches when a Windows separator sits directly before the "u".
stem = os.path.splitext(os.path.basename(filename))[0]
df = df.assign(user_iD=stem[1:] if stem.startswith("u") else stem)

Related

How to run function on multiple dataframes of variable row sizes, then generate a new dataframe with just the function results

I have a folder full of CSVs of equal columns but variable rows. I want to convert each to a dataframe and run a simple function on them, and create one new dataframe with just the function values and the file names as the index.
So far I have:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
for index, file in enumerate(files):
df = pd.read_csv(file, sep=',', index_col=[0])
df.loc['total'] = df.sum(numeric_only=True, axis=0) # or any function
pd.concat(df2[df.index == 'total'])
df.to_csv('file_path')
I'm sure there are several ways in which this is messed up, but any advice is appreciated
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
# Ask for a folder, then compute one row of column totals per CSV in it.
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)

dfs = []
for file in files:
    df = pd.read_csv(file, sep=',', index_col=[0])
    total = df.sum(numeric_only=True, axis=0)  # or any function
    # Naming the Series after the file makes the file name the row label below.
    # (The earlier df[['total']] looked up a *column* called 'total', which
    # does not exist, because 'total' was added as a row.)
    dfs.append(total.rename(os.path.basename(file)))

# A list of named Series becomes one row per Series, indexed by those names.
df_total = pd.DataFrame(dfs)
# NOTE: 'file_path' is a literal file name in the current working directory,
# not the folder chosen above.
df_total.to_csv('file_path')
OK I figured it out:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
# Ask for a folder, then build a table with one row per CSV:
# the file's path plus the totals of the selected columns.
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)

# Columns to keep in the summary; write the names exactly as they appear in
# the CSVs. (Replaces the bare `etc` placeholder, which is an undefined name.)
columns = ['dfcolumn1', 'dfcolumn2']

rows = []
for file in files:
    df = pd.read_csv(file, sep=',', index_col=[0])
    total = df.sum(numeric_only=True, axis=0)  # or any function
    rows.append(total[columns])

# Both frames carry a plain 0..n-1 index in `files` order, so the join lines
# each total up with the file it was computed from.
filename = pd.DataFrame({'Filename': files})
total = pd.DataFrame(rows).reset_index(drop=True)
total_named = filename.join(total)
# NOTE: 'file_path' is a literal file name in the current working directory,
# not the folder chosen above.
total_named.to_csv('file_path')

Multiple csv not being added to pandas

I hope you can help me with this problem.
I am having issues with adding multiple CSV files in pandas.
I have 12 files of sales data that have the same columns (one for each month: Sales_January_2019, Sales_February_2019.... and so on until December).
I've tried the following code, but it does not seem to work. Also, the index should be continuous across files rather than restarting after each one. I tried reset_index(), but that didn't work either.
import pandas as pd
import glob
path = r'C:\Users\ricar\.spyder-py3\data' # my path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
df = pd.read_csv(filename, index_col=0, header=0)
li.append(df)
df.reset_index(inplace=True)
frame = pd.concat(li, axis=0, ignore_index=True)
df.drop(columns = ['x_t', 'perf'], inplace=True)
print(df)
Try correcting your code like this:
import pandas as pd
import glob
path = r'C:\Users\ricar\.spyder-py3\data'  # my path

# Read every CSV in the folder (first column as index, first row as header)
# and stack the rows; ignore_index=True renumbers them 0..n-1 across files.
monthly_frames = (
    pd.read_csv(csv_file, index_col=0, header=0)
    for csv_file in glob.glob(path + "/*.csv")
)
combined = pd.concat(monthly_frames, axis=0, ignore_index=True)

# Remove useless columns
df = combined.drop(columns=["x_t", "perf"])
print(df)

What does "TypeError: 'generator' object does not support item assignment" mean?

When I try to run the following code I get an error called: TypeError: 'generator' object does not support item assignment. How can I fix this?
import os, glob
import pandas as pd
import re
import sys
path = r'C:\Users\Nicole\02_Datenverarbeitung und Analyse\Input'
all_files = glob.glob(os.path.join(path, "scrapeddata*.csv"))
for files in all_files:
basic_file_name = files.replace(path, '')
date = basic_file_name[13:22]
df_from_each_file = (pd.read_csv(f, sep=',', encoding='iso-8859-1', error_bad_lines=False, warn_bad_lines=False) for f in all_files)
df_from_each_file['date'] = 'date'
df_merged = pd.concat(df_from_each_file, ignore_index=True)
df_merged
Try this:
...
# Use brackets instead of parentheses: brackets build a list of dataframes,
# whereas parentheses build a generator, which does not support item
# assignment (hence the TypeError).
df_from_each_file = [
    # on_bad_lines="skip" silently drops malformed rows; it replaces the
    # error_bad_lines/warn_bad_lines pair, which newer pandas no longer accepts.
    pd.read_csv(f, sep=",", encoding="iso-8859-1", on_bad_lines="skip").assign(
        # Tag each frame with the date taken from its own file name (same
        # slice as in the question) instead of the literal string "date".
        date=f.replace(path, "")[13:22]
    )
    for f in all_files
]
# Concat dataframes
df_merged = pd.concat(df_from_each_file, ignore_index=True)

Need to pick 'second column' from multiple csv files and save all 'second columns' in one csv file

I have 366 CSV files, and I want to copy the second column of each one into a single new CSV file. I need code for this job. I tried some of the code available here, but nothing works. Please help.
Assuming all the 2nd columns are the same length, you could simply loop through all the files. Read them, save the 2nd column to memory and construct a new df along the way.
import pandas as pd

# List every CSV file to read here.
filenames = ['test.csv']

# Second column of each file, keyed by the header it gets in the output.
second_columns = {}
for filename in filenames:
    df = pd.read_csv(filename)
    second_columns[f'SECOND_COLUMN_{filename.upper()}'] = df.iloc[:, 1]
    del df  # release the full frame before reading the next file

# Build the result in one step: the dict keys become the column names, and
# shorter columns are padded with NaN instead of longer ones being cut to the
# length of the first file.
new_df = pd.concat(second_columns, axis=1)
new_df.to_csv('new_csv.csv', index=False)
import glob
import os

import pandas as pd

filenames = glob.glob(r'D:/CSV_FOLDER' + "/*.csv")

# Second column of each file, keyed by the header it gets in the output.
second_columns = {}
for filename in filenames:
    df = pd.read_csv(filename)
    # Name the column after the file's stem only, so the folder path and the
    # ".csv" extension do not end up in the header.
    stem = os.path.splitext(os.path.basename(filename))[0]
    second_columns[f'SECOND_COLUMN_{stem.upper()}'] = df.iloc[:, 1]
    del df  # release the full frame before reading the next file

# Build the result in one step; shorter columns are padded with NaN instead of
# longer ones being cut to the length of the first file. pd.concat raises on
# an empty input, so an empty folder yields an empty frame instead.
new_df = pd.concat(second_columns, axis=1) if second_columns else pd.DataFrame()
new_df.to_csv('new_csv.csv', index=False)
This can be accomplished with glob and pandas:
import glob
import pandas as pd
import os

OUTPUT_FILE = 'output.csv'

# Sorted for a stable column order. The output file lands in the same folder,
# so skip it in case it is left over from a previous run.
mylist = sorted(f for f in glob.glob("*.csv") if f != OUTPUT_FILE)

second_columns = []
seen_names = set()
for x in mylist:
    col = pd.read_csv(x).iloc[:, 1]  # only keep 2nd column
    # Files that share a header would otherwise overwrite one another's
    # column; add the file stem to repeated names so every file keeps its own.
    if col.name in seen_names:
        col = col.rename(f"{col.name}_{os.path.splitext(x)[0]}")
    seen_names.add(col.name)
    second_columns.append(col)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no CSV files.
df = pd.concat(second_columns, axis=1) if second_columns else pd.DataFrame()
df.to_csv(OUTPUT_FILE, index=False)
import glob
import pandas as pd
import os

OUTPUT_FILE = 'output.csv'

# Sorted for a stable column order. The output file lands in the same folder,
# so skip it in case it is left over from a previous run.
mylist = sorted(f for f in glob.glob("*.csv") if f != OUTPUT_FILE)

second_columns = []
seen_names = set()
for x in mylist:
    # Position 1 is the 2nd column (position 0 would be the first one).
    col = pd.read_csv(x).iloc[:, 1]  # only keep 2nd column
    # Files that share a header would otherwise overwrite one another's
    # column; add the file stem to repeated names so every file keeps its own.
    if col.name in seen_names:
        col = col.rename(f"{col.name}_{os.path.splitext(x)[0]}")
    seen_names.add(col.name)
    second_columns.append(col)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no CSV files.
df = pd.concat(second_columns, axis=1) if second_columns else pd.DataFrame()
df.to_csv(OUTPUT_FILE, index=False)

Get file created date - add to dataframes column on read_csv

I need to pull many (hundreds of) CSVs into a pandas dataframe. I need to add the date each file was created as a column when it is read into the dataframe. I can obtain the date of creation for a CSV file using this call:
time.strftime('%m/%d/%Y', time.gmtime(os.path.getmtime('/path/file.csv')))
As an fyi, this is the command I am using to read in the CSVs:
path1 = r'/path/'
all_files_standings = glob.glob(path1 + '/*.csv')
standings = pd.concat((pd.read_csv(f, low_memory=False, usecols=[7, 8, 9]) for f in standings))
I tried running this call (which worked):
dt_gm = [time.strftime('%m/%d/%Y', time.gmtime(os.path.getmtime('/path/file.csv')))]
So then I tried expanding it:
dt_gm = [time.strftime('%m/%d/%Y', time.gmtime(os.path.getmtime(f) for f in all_files_standings))]
and I get this error:
TypeError: an integer is required (got type generator)
How can I resolve this?
If the different files have the same columns and you would like to append them as rows:
import pandas as pd
import time
import os
# list of files you want to read
files = ['one.csv', 'two.csv']
column_names = ['c_1', 'c_2', 'c_3']

# One frame per file, each tagged with the file's date and name.
all_dataframes = []
for file_name in files:
    frame = pd.read_csv(file_name, delimiter=',', header=None)
    frame.columns = column_names
    # os.path.getmtime gives the last-modification timestamp; it is formatted
    # as MM/DD/YYYY in UTC (time.gmtime).
    stamp = time.strftime('%m/%d/%Y', time.gmtime(os.path.getmtime(file_name)))
    all_dataframes.append(frame.assign(creation_time=stamp, file_name=file_name))

# Stack the frames vertically under a fresh 0..n-1 index.
df = pd.concat(all_dataframes, axis=0, ignore_index=True)
df
output:
If you want to append the different files by columns:
# One frame per file; every column is prefixed with "f_<position>_" so the
# frames can sit side by side without name clashes.
all_dataframes = []
for idx, file_name in enumerate(files):
    prefix = f'f_{idx}_'
    frame = pd.read_csv(file_name, delimiter=',', header=None)
    frame.columns = [prefix + name for name in column_names]
    # os.path.getmtime gives the last-modification timestamp; it is formatted
    # as MM/DD/YYYY in UTC (time.gmtime).
    modified = time.gmtime(os.path.getmtime(file_name))
    frame[prefix + 'creation_time'] = time.strftime('%m/%d/%Y', modified)
    all_dataframes.append(frame)

# Place the frames next to each other, aligned on their row index.
pd.concat(all_dataframes, axis=1)
output:

Categories

Resources