I am trying to store the output of an if condition in a DataFrame. Below is what I am trying:
import os
filename = "Desktop/sales/10-05-2018"
#check file exists
if os.path.exists(filename):
    print("Files received")
else:
    print("No files received")
Instead of printing the output, I would like to store it in a DataFrame. Could anyone advise on this? Thanks.
This is one way you can store such a mapping in a dataframe.
import os, pandas as pd
df = pd.DataFrame(columns=['filename', 'exists'])
df['filename'] = ['file1.csv', 'file2.csv', 'file3.csv']
df['exists'] = df['filename'].map(os.path.exists)
This will create a dataframe of filenames in one column and a Boolean series in another indicating whether or not the file exists.
If the filenames are retrieved from an iterable, you should aggregate to a list of lists first before constructing a dataframe. Appending continually to an existing dataframe is inefficient in this situation.
lst = ( ... some iterable ... )
lst_of_lst = [[f, os.path.exists(f)] for f in lst]
df = pd.DataFrame(lst_of_lst, columns=['filename', 'exists'])
Related
I have a list of files to be imported into a dataframe.
code:
# The list contains each dataset name followed by the column name used to match all the datasets; this list keeps changing, and so do the file formats. The dataset file names are provided by the user, and they are unique.
# First: find the file extension format and select appropriate pd.read_ to import
# second: merge the dataframes on the index
# in the below list,
file_list = ['dataset1.csv','datetime','dataset2.xlsx','timestamp']
df = pd.DataFrame()
for i in range(0, len(file_list), 2):
    # find the file type first
    # presently, I don't know how to find the file type; so
    file_type = 'csv'
    # second: merge the dataframe into the existing dataframe on the index
    tdf = pd.DataFrame()
    if file_type == 'csv':
        tdf = pd.read_csv('%s'%(file_list[i]))
    if file_type == 'xlsx':
        tdf = pd.read_excel('%s'%(file_list[i]))
    tdf.set_index('%s'%(file_list[i+1]), inplace=True)
    # Merge dataframe with the existing dataframe
    df = df.merge(tdf, right_index=True, left_index=True)
I got this far. Is there any module available to directly find the file type? I found magic, but it has issues when importing it. Also, could you suggest a better approach to merging the files?
Update: Working solution
Inspired by @ljdyer's answer below, I came up with the following, which is working perfectly:
def find_file_type_import(file_name):
    # Total file extensions possible for importing data
    file_type = {'csv': 'pd.read_csv(file_name)',
                 'xlsx': 'pd.read_excel(file_name)',
                 'txt': 'pd.read_csv(file_name)',
                 'parquet': 'pd.read_parquet(file_name)',
                 'json': 'pd.read_json(file_name)'
                 }
    df = [eval(val) for key, val in file_type.items()
          if file_name.endswith(key)][0]
    return df
df = find_file_type_import(file_list[0])
This is working perfectly. Thank you for your valuable suggestions. Also, could you tell me whether the use of eval here is a good idea or not?
The file type is just the three or four letters at the end of the file name, so the simplest way to do this would just be:
if file_list[i].endswith('csv'):
etc.
Other common options would be os.path.splitext or the suffix attribute of a Path object, from the built-in os and pathlib libraries respectively.
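For example, a quick sketch of both options (using file names from your list):
import os
from pathlib import Path

os.path.splitext('dataset1.csv')[1]  # returns '.csv' (note the leading dot)
Path('dataset2.xlsx').suffix         # returns '.xlsx'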
The way you are merging looks fine, but I'm not sure why you are using percent notation for the parameters to read_, set_index, etc. The elements of your list are just strings anyway, so for example
tdf = pd.read_csv('%s'%(file_list[i]))
could just be:
tdf = pd.read_csv(file_list[i])
(Answer to follow-up question)
Really nice idea to use a dict! It is generally considered good practice to avoid eval wherever possible, so here's an alternative option with the pandas functions themselves as dictionary values. I also suggest a prettier syntax for your list comprehension with exactly one element based on this answer and some clearer variable names:
def find_file_type_import(file_name):
    # Total file extensions possible for importing data
    read_functions = {'csv': pd.read_csv,
                      'xlsx': pd.read_excel,
                      'txt': pd.read_csv,
                      'parquet': pd.read_parquet,
                      'json': pd.read_json}
    [df] = [read(file_name) for file_ext, read in read_functions.items()
            if file_name.endswith(file_ext)]
    return df
You can use glob (or even just os) to retrieve the files matching part of a name. Since you guarantee the uniqueness of the file name irrespective of the extension, there will only be one match (otherwise, just add a loop that iterates over the retrieved elements).
Once you have the full file name (which clearly has the extension), just split() on the dot and take the last element, which corresponds to the file extension.
Then, you can read the dataframe with the appropriate function.
Here is an example of code:
from glob import glob
file_list = [
'dataset0', # corresponds to dataset0.csv
'dataset1', # corresponds to dataset1.xlsx
'dataset2.a'
]
for file in file_list:
    files_with_curr_name = glob(f'*{file}*')
    if len(files_with_curr_name) > 0:
        full_file_name = files_with_curr_name[0]  # take the first element, the uniqueness of the file name being guaranteed
        # extract the file extension (string after the dot, so the last element of split)
        file_type = full_file_name.split(".")[-1]
        if file_type == 'csv':
            print(f'Read {full_file_name} as csv')
            # df = pd.read_csv(full_file_name)
        elif file_type == 'xlsx':
            print(f'Read {full_file_name} as xlsx')
        else:
            print(f"Don't read {full_file_name}")
Output will be:
Read dataset0.csv as csv
Read dataset1.xlsx as xlsx
Don't read dataset2.a
Using pathlib and a switch dict to call functions.
from pathlib import Path
import pandas as pd
def main(files: list) -> None:
    caller = {
        ".csv": read_csv,
        ".xlsx": read_excel,
        ".pkl": read_pickle
    }
    for file in get_path(files):
        print(caller.get(file.suffix)(file))

def get_path(files: list) -> list:
    file_path = [x for x in Path.home().rglob("*") if x.is_file()]
    return [x for x in file_path if x.name in files]

def read_csv(file: Path) -> pd.DataFrame:
    return pd.read_csv(file)

def read_excel(file: Path) -> pd.DataFrame:
    return pd.read_excel(file)

def read_pickle(file: Path) -> pd.DataFrame:
    return pd.read_pickle(file)

if __name__ == "__main__":
    files_to_read = ["spam.csv", "ham.pkl", "eggs.xlsx"]
    main(files_to_read)
I have a directory with hundreds of CSV files that represent the pixels of a thermal camera (288x383). I want to get the center value of each file (e.g. 144 x 191) and, with each of those collected values, build a dataframe that also lists the name of each file.
Here is my code, where I create the dataframe with the list of the CSV file names:
import os
import glob
import numpy as np
import pandas as pd
os.chdir("/Programming/Proj1/Code/Image_Data")
!ls
Out:
2021-09-13_13-42-16.csv
2021-09-13_13-42-22.csv
2021-09-13_13-42-29.csv
2021-09-13_13-42-35.csv
2021-09-13_13-42-47.csv
2021-09-13_13-42-53.csv
...
file_extension = '.csv'
all_filenames = [i for i in glob.glob(f"*{file_extension}")]
files = glob.glob('*.csv')
all_df = pd.DataFrame(all_filenames, columns=['Full_name'])
all_df.head()
Full_name
0 2021-09-13_13-42-16.csv
1 2021-09-13_13-42-22.csv
2 2021-09-13_13-42-29.csv
3 2021-09-13_13-42-35.csv
4 2021-09-13_13-42-47.csv
5 2021-09-13_13-42-53.csv
6 2021-09-13_13-43-00.csv
You can loop through your files one by one, reading them in as a dataframe and taking the center value that you want. Then save this value along with the file name. This list of results can then be read in to a new dataframe ready for you to use.
result = []
for file in files:
    # read in the file, you may need to specify some extra parameters
    # check the pandas docs for read_csv
    df = pd.read_csv(file)
    # now select the value you want
    # this will vary depending on what your indexes look like (if any)
    # and also your column names
    value = df.loc[row, col]
    # append to the list
    result.append((file, value))

# you should now have a list in the format:
# [('2021-09-13_13-42-16.csv', 100), ('2021-09-13_13-42-22.csv', 255), ...
# load the list of tuples as a dataframe for further processing or analysis...
result_df = pd.DataFrame(result)
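For the centre pixel mentioned in the question, the selection step might look something like the sketch below (this assumes the CSVs are raw 288x383 pixel grids with no header row, so purely positional indexing applies; adjust the indices to your actual layout):
df = pd.read_csv(file, header=None)  # assumption: no header row in the pixel CSVs
value = df.iloc[144, 191]            # the centre position from the question
and you can give the final dataframe named columns if you like:
result_df = pd.DataFrame(result, columns=['Full_name', 'Center_value'])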
I'm trying to pull out the rows from these CSVs where the state is "Pennsylvania": https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports_us
I have this code:
import glob
import pandas as pd
df = []
path = "/home/reallymemorable/Documents/git/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv"
for fname in glob.glob(path):
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    print(row)
I'm getting this error:
AttributeError: 'list' object has no attribute 'loc'
I understand that it's expecting a DataFrame but I've set df as a list. But I don't know how to make it a DataFrame so that my pattern matching works correctly.
What am I doing wrong?
In your code, df is the empty list [] that you define at the beginning, not a Pandas DataFrame. Did you forget to load the data?
path = "/home/reallymemorable/Documents/git/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv"
for fname in glob.glob(path):
    df = pd.read_csv(fname)  # this line???
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    print(row)
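If you want the Pennsylvania rows from all the files collected into a single dataframe rather than just printed, one common pattern is to gather the matches in a list and concatenate at the end (a sketch, assuming every file has a Province_State column):
import glob
import pandas as pd

path = "/home/reallymemorable/Documents/git/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv"
matches = []
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    matches.append(df.loc[df['Province_State'] == 'Pennsylvania'])
pa_df = pd.concat(matches, ignore_index=True)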
I am reading a CSV file into a pandas dataframe using Python. I want to read in a list of text files into a new column of the dataframe.
The original CSV file I'm reading from looks like this:
Name,PrivateIP
bastion001,10.238.2.166
logicmonitor001,10.238.2.52
logicmonitor002,45.21.2.13
The original dataframe looks like this.
code:
hosts_list = dst = os.path.join('..', '..', 'source_files', 'aws_hosts_list', 'aws_hosts_list.csv')
fields = ["Name", "PrivateIP"]
orig_df = pd.read_csv(hosts_list, skipinitialspace=True, usecols=fields)
print(f"Orig DF: {orig_df}")
output:
Orig DF:
Name PrivateIP
0 bastion001 10.238.2.166
1 logicmonitor001 10.238.2.52
2 logicmonitor002 45.21.2.13
The text directory has a bunch of text files in it with memory readings in each:
bastion001-memory.txt B-mmp-rabbitmq-core002-memory.txt logicmonitor002-memory.txt mmp-cassandra001-memory.txt company-division-rcsgw002-memory.txt
B-mmp-platsvc-core001-memory.txt haproxy001-memory.txt company-cassandra001-memory.txt mmp-cassandra002-memory.txt company-waepd001-memory.txt
B-mmp-platsvc-core002-memory.txt haproxy002-memory.txt company-cassandra002-memory.txt mmp-cassandra003-memory.txt company-waepd002-memory.txt
B-mmp-rabbitmq-core001-memory.txt logicmonitor001-memory.txt company-cassandra003-memory.txt company-division-rcsgw001-memory.txt company-waepd003-memory.txt
Each file looks similar to this:
cat haproxy001-memory.txt
7706172
I read each file into the existing dataframe.
rowcount == 0
text_path = '/home/tdun0002/stash/cloud_scripts/output_files/memory_stats/text/'
filelist = os.listdir(text_path)
for filename in filelist:
    if rowcount == 0:
        pass
    else:
        my_file = text_path + filename
        print(f"Adding {filename} to DF")
        try:
            orig_df = pd.update(my_file)
            print(f"Data Frame: {orif_df}")
            ++rowcount
        except Exception as e:
            print(f"An error has occurred: {e}")
But when I try to read the resulting dataframe again it has not been updated. I gave the new DF a new name for clarity.
code:
result_df = orig_df
pd.options.display.max_rows
print(f"\nResult Data Frame:\n{result_df}\n")
output:
Result Data Frame:
Name PrivateIP
0 bastion001 10.238.2.166
1 logicmonitor001 10.238.2.52
2 logicmonitor002 45.21.2.13
How can I create a new column called Memory in the DF and add the contents of the text files to that column?
Here's code that I hope will work. It's a bit clunky, but you'll get the idea. There are comments inside.
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
# get all files in the directory
# i used os.getcwd() to get the current directory
# if your text files are in another dir, then write exact dir location
# this gets you all files in your text dir
onlyfiles = [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]
# convert it to series
memory_series = pd.Series(onlyfiles)
# an apply function to get just txt files
# others will be returned as None
def file_name_getter(x):
    names = x.split(".", maxsplit=1)
    if names[1] == "txt":
        return names[0]
    else:
        return None
# apply the function and get a new series with name values
mem_list = memory_series.apply(lambda x: file_name_getter(x))
# now read first line of txt files
# and this is the function for it
def get_txt_data(x):
    if x != None:
        with open(f'{x}.txt') as f:
            return int(f.readline().rstrip())
    else:
        return 0
# apply the function, get a new series with memory values
mem_val_list = mem_list.apply(lambda x: get_txt_data(x))
# create a df where our Name and Memory data are present
# cast Memory data as int
df = pd.DataFrame(mem_val_list, columns=["Memory"], dtype="int")
df["Name"] = mem_list
# get rid of -memory now
def name_normalizer(x):
    if x is None:
        return x
    else:
        return x.rsplit("-", maxsplit=1)[0]
# apply function
df["Name"] = df["Name"].apply(lambda x: name_normalizer(x))
# our sample orig_df
orig_df = pd.DataFrame([["algo_2", "10.10.10"], ["other", "20.20.20"]], columns=["Name", "PrivateIP"])
# merge using on, so if we miss data; that data wont cause any problem
# all matching names will get their memory values
final_df = orig_df.merge(df, on="Name")
edit: fixed Name to be returned correctly. (xxx-memory to xxx)
So I have been trying to merge .csv files with Pandas, and trying to create a couple of functions to automate it, but I keep running into an issue.
My problem is that I want to stack one .csv after the other (same number of columns, different number of rows), but instead of getting a bigger .csv with the same number of columns, I get a bigger .csv with more columns and rows (the correct number of rows, but more columns than there should be).
The code I'm using is this one:
import os
import pandas as pd
def stackcsv(content_folder):
    global combined_csv
    combined_csv = []
    entries = os.listdir(content_folder)
    for i in entries:
        csv_path = os.path.join(content_folder, i)
        solo_csv = pd.read_csv(csv_path, index_col=None)
        combined_csv.append(solo_csv)
    csv_final = pd.concat(combined_csv, axis=0, ignore_index=True)
    return csv_final.to_csv("final_data.csv", index=None, header=None)
I have 3 .csv files that have a size of 20000x17, and I want to merge them into one of 60000x17. I suppose my error must be in the arguments index, header, index_col, etc.
Thanks in advance.
So after modifying the code, it worked. First of all, as Serge Ballesta said, it is necessary to tell read_csv that there is no header. Finally, using sort=False, the function works perfectly. This is the final code that I have used, and the final .csv is 719229 rows × 17 columns long. Thanks to everybody!
import os
import pandas as pd
def stackcsv(content_folder):
    global combined_csv
    combined_csv = []
    entries = os.listdir(content_folder)
    for i in entries:
        csv_path = os.path.join(content_folder, i)
        solo_csv = pd.read_csv(csv_path, index_col=None, header=None)
        combined_csv.append(solo_csv)
    csv_final = pd.concat(combined_csv, axis=0, sort=False)
    return csv_final.to_csv("final_data.csv", header=None)
If the files have no header you must say so to read_csv. If you don't, the first line of each file is read as a header line. As a result the DataFrames have different column names and concat will add new columns. So you should read with:
solo_csv = pd.read_csv(csv_path,index_col=None, header=None)
Alternatively, there is no reason to decode them at all, and you could just concatenate the files sequentially:
def stackcsv(content_folder):
    with open("final_data.csv", "w") as fdout:
        entries = os.listdir(content_folder)
        for i in entries:
            csv_path = os.path.join(content_folder, i)
            with open(csv_path) as fdin:
                while True:
                    chunk = fdin.read()
                    if len(chunk) == 0:
                        break
                    fdout.write(chunk)
Set the parameter sort to False in the pandas concat function:
csv_final = pd.concat(combined_csv, axis = 0, ignore_index=True, sort=False)