read csv in a for loop using pandas - python

import glob
import os

# pd.read_csv cannot expand a wildcard pattern; build the list of matching
# files with glob first, then read each CSV inside the loop.
inp_dir = os.getcwd()
files_comp = glob.glob(os.path.join(inp_dir, "B00234*.csv"))
for f in files_comp:
    df_calculated = pd.read_csv(f, na_values=missing_values, nrows=10)
    # original referenced undefined `df`; the frame just read is df_calculated
    col_length = len(df_calculated.columns) - 1
Hi folks, how can I read 4 csv files in a for loop? I am getting an error while reading the CSVs in the above format. Kindly help me.

You basically need this:
Get a list of all target files. files=os.listdir(path) and then keep only the filenames that start with your pattern and end with .csv.
You could also improve it using regular expression (by importing re library for more sophistication, or use glob.glob).
# Keep only the files whose names match the "B00234*.csv" pattern.
filesnames = [
    name for name in os.listdir(path)
    if name.startswith("B00234") and name.lower().endswith(".csv")
]
Read in files using a for loop:
# Read each matching file into its own DataFrame and collect them.
dfs = []
for fname in filesnames:
    df = pd.read_csv(fname)
    dfs.append(df)
Complete Example
We will first make some dummy data and then save it to some .csv and .txt files. Some of these .csv files will begin with "B00234" and some others will not. We will write the dummy data to these files, and then selectively read only the matching .csv files into a list of dataframes, dfs.
import os
import shutil

import numpy as np
import pandas as pd

# `display` is only available when IPython is installed (e.g. in Jupyter);
# fall back to plain print so the script also runs as a regular program.
try:
    from IPython.display import display
except ImportError:
    display = print

# Define Temporary Output Folder
path = './temp_output'

# Clean Temporary Output Folder (os/shutil are now imported up top;
# the original used `os` here before importing it further down)
reset = True
if os.path.exists(path) and reset:
    shutil.rmtree(path, ignore_errors=True)

# Create Content (np was used but never imported in the original)
df0 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                   columns=['a', 'b', 'c'])
display(df0)

# Make Path
if not os.path.exists(path):
    os.makedirs(path)
else:
    print('Path Exists: {}'.format(path))

# Make Filenames: half start with "B00234", half with "B00678",
# each in both .csv and .txt flavours.
filenames = list()
for i in range(10):
    if i < 5:
        # Create Files starting with "B00234"
        filenames.append("B00234_{}.csv".format(i))
        filenames.append("B00234_{}.txt".format(i))
    else:
        # Create Files starting with "B00678"
        filenames.append("B00678_{}.csv".format(i))
        filenames.append("B00678_{}.txt".format(i))

# Create files with extensions .csv and .txt, and file names starting
# with and without "B00234".
for filename in filenames:
    fpath = os.path.join(path, filename)  # safer than manual '/' concatenation
    if filename.lower().endswith(".csv"):
        df0.to_csv(fpath, index=False)
    else:
        with open(fpath, 'w') as f:
            f.write(df0.to_string())

# Get list of target files: only CSVs whose names start with "B00234".
files = os.listdir(path)
files = [f for f in files if (f.startswith("B00234") and f.lower().endswith(".csv"))]
print('\nList of target files: \n\t{}\n'.format(files))

# Read each csv file into a dataframe
dfs = list()  # a list of dataframes
for csvfile in files:
    fpath = os.path.join(path, csvfile)
    print("Reading file: {}".format(csvfile))
    df = pd.read_csv(fpath)
    dfs.append(df)
The list dfs should have five elements, where each is a dataframe read from one of the files.
Output:
a b c
0 1 2 3
1 4 5 6
2 7 8 9
List of target files:
['B00234_3.csv', 'B00234_4.csv', 'B00234_0.csv', 'B00234_2.csv', 'B00234_1.csv']
Reading file: B00234_3.csv
Reading file: B00234_4.csv
Reading file: B00234_0.csv
Reading file: B00234_2.csv
Reading file: B00234_1.csv

Related

Working with multiple ZIP files inside directory and convert, rename the files with Python

I have directory that have a lot of ZIP files. The ZIP files contain a lot of CSV files. First, I want to change the format of CSV files into parquet. Second, I need to rename all the parquet files and store the data into CSV. (Code below). I need to work with the zip files and not extracting the files to save some storage space.
Below is the code to convert to .parquet.
# Read each CSV from the helper's file list and write it back out as a
# gzip-compressed parquet file in the target folder.
flist = ul.get_flist(r"D:\Proyekan\Data yang udah di extract", "csv")
target_folder = "D:\\Proyekan\\Data yang udah di extract\\Parquet\\"
for i, fpath in enumerate(flist):
    frame = pd.read_csv(fpath)
    # swap the .csv extension for .parquet on the bare file name
    fname = fpath.split('\\')[-1].split('.')[0] + '.parquet'
    print(f"{i:03} ... Working on file ... {fname}")
    frame.to_parquet(f"{target_folder}{fname}", compression="gzip")
And below is the code to rename the files
import os
import pandas as pd

# Walk every file under `path`, rename it to a sequential
# "flight_<number>.mat" code, and record the mapping of original name,
# new name, and containing folder in a metadata CSV.

# Raw string: "\P", "\D" and "\F" are invalid escape sequences in a
# normal string literal (DeprecationWarning today, SyntaxError later).
path = r"D:\Proyekan\Data FDM"
count = 1
ori_filename = []
new_filename = []
folder = []
head, tail = os.path.split(path)
for root, dirs, files in os.walk(path):
    for file in files:
        new_filecode = "flight_" + str(1000000 + count) + ".mat"
        ori_filename.append(os.path.basename(file))
        new_filename.append(new_filecode)
        folder.append(os.path.basename(root))  # folder acts as tail number
        fullpath = os.path.join(root, file)
        os.rename(fullpath, os.path.join(root, new_filecode))
        count += 1

# Store data to csv
df = pd.DataFrame(
    list(zip(ori_filename, new_filename, folder)),
    columns=['raw_file', 'file_id', 'tail_number'],
)
df.to_csv(r'D:\Proyekan\FILES\Metadata.csv', index=False, header=True)
Any ideas how do I edit this code to read ZIP files? Any help would be appreciated
this is a shot in the dark, try it and let's see how it goes:
import os
from glob import glob
from zipfile import ZipFile

# Folder that holds the zipped CSVs, and the folder the parquet files go to.
folder = 'list of zipped files'
out_folder = 'new_location'

# Iterating a string iterates its CHARACTERS, so the folder name must
# first be expanded into an actual list of .zip paths.
for zip_path in glob(os.path.join(folder, '*.zip')):
    with ZipFile(zip_path) as myzip:
        for csv_name in myzip.namelist():
            if not csv_name.lower().endswith('.csv'):
                continue  # skip any non-CSV archive members
            # Read the csv straight from the zip (no extraction needed)
            # and convert it to parquet under a per-file output name —
            # the original wrote every frame to the same literal path,
            # overwriting all but the last one.
            with myzip.open(csv_name) as myfile:
                parquet_name = os.path.splitext(os.path.basename(csv_name))[0] + '.parquet'
                pd.read_csv(myfile).to_parquet(os.path.join(out_folder, parquet_name))

Import multiple excel files and merge into single pandas df with source name as column

I'm trying to merge a bunch of xlsx files into a single pandas dataframe in python. Furthermore, I want to include a column that lists the source file for each row. My code is as follows:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import glob
import os

# get the path for where the xlsx files are
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']

# Read each file into its own frame and tag its rows with the source file
# BEFORE combining: assigning df['Source_file'] = f after appending, as
# the original did, overwrites the entire column with the last filename.
frames = []
for f in files_xlsx:
    data = pd.read_excel(f, 'Sheet 1')
    data['Source_file'] = f
    frames.append(data)
# DataFrame.append was removed in pandas 2.x; concat is the replacement.
df = pd.concat(frames) if frames else pd.DataFrame()
however, when I look at the 'Source_file' column it lists the final file it reads as the name for every row. I've spent way more time than I should trying to fix this. What am I doing wrong?
Within your for loop you are overwriting df on each iteration, so you'll only get back the final file's name.
What you need to do is declare a list beforehand and append to that;
since you imported glob, let's use that as well.
# Pass the pattern to os.path.join instead of concatenating '\*.xlsx':
# the backslash is an invalid escape sequence and is wrong on non-Windows
# platforms; join keeps the separator platform-correct.
files = glob.glob(os.path.join(os.getcwd(), '*.xlsx'))
dfs = [pd.read_excel(f, sheet_name='Sheet1') for f in files]
df = pd.concat(dfs)
if you want to add the filename into the df too then,
# Same fix as above: join the '*.xlsx' pattern properly instead of the
# '\*.xlsx' string concatenation with its invalid escape sequence.
files = glob.glob(os.path.join(os.getcwd(), '*.xlsx'))
dfs = [pd.read_excel(f, sheet_name='Sheet1') for f in files]
file_names = [os.path.basename(f) for f in files]
# keys= labels each frame's rows with its source file in the outer index
df = pd.concat(dfs, keys=file_names)
Using Pathlib module (recommended Python 3.4+)
from pathlib import Path

# list(...) is enough here — wrapping the glob in a comprehension that
# yields each element back unchanged was redundant.
files = list(Path.cwd().glob('*.xlsx'))
dfs = [pd.read_excel(f, sheet_name='Sheet1') for f in files]
file_names = [f.stem for f in files]
df = pd.concat(dfs, keys=file_names)
or as a one liner :
# Glob once and reuse the list: scanning the directory twice (once for the
# frames, once for the keys) can mis-pair them if files change in between.
_files = list(Path.cwd().glob('*.xlsx'))
df = pd.concat([pd.read_excel(f) for f in _files], keys=[f.stem for f in _files], sort=False)

Problems reading csv files via numpy and pandas

I have a folder with a certain path and I want to go through all the folders within this folder. Each subfolder contains multiple files, of which I only want a certain ".csv" file. I succeed in reading the different folders and selecting the correct file, but when I try to open it (both with pandas and numpy), I get an IOError stating that the corresponding file doesn't exist.
import pandas as pd
import numpy as np
path = "some_path"
data_list = os.listdir(path)
array = []
# NOTE(review): as pasted, this loop calls function() before it is defined
# below, which would raise NameError before any IOError; presumably the
# real file defines the function first — confirm against the original.
for file in data_list:
if file.endswith("name_requirement"):
array.append(function(file))
file_filter = "second_name_requirement"
def function(second_path):
file_list = os.listdir(path + "\\" + str(second_path))
average = 0
for file in file_list:
if str(file)[-20:-4] == file_filter:
# NOTE(review): `file` is a bare name from os.listdir, so read_csv looks
# for it in the current working directory — the likely cause of the
# reported IOError; join it with the subfolder path instead.
measurements = pd.read_csv(file, delimiter = ";", skiprows = 1, usecols = [1])
# measurements = np.loadtxt(file, delimiter =";", skiprows =1)
.....
`

FileNotFoundError: [duplicate]

This question already has answers here:
How to do a recursive sub-folder search and return files in a list?
(13 answers)
Closed 7 months ago.
There are a lot of csv sheets in a folder. Every sheet contains data only in its first 3 columns. I want to select the corresponding csv sheet from the many csv sheets and then plot it. Here is the code:
import os
import pandas as pd
# NOTE(review): this snippet also relies on `import matplotlib.pyplot as plt`.

path = "F:\\Users\\Desktop\\Data\\Summary"

# Collect FULL paths: os.walk yields bare filenames, so the directory part
# must be joined back on or read_csv raises FileNotFoundError.
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        if '.csv' in file:
            files.append(os.path.join(r, file))

for i, f in enumerate(files):
    print((i, f))
print('\n'.join(f'{i}-{os.path.basename(v)}' for i, v in enumerate(files)))

# Index into the SAME list the menu was built from, with an int — the
# original indexed folder_data (a different list) with a string, which is
# what produced the FileNotFoundError.
csv_code = int(input("Enter corresponding code to plot: "))
csv_path = files[csv_code]

df = pd.read_csv(csv_path, header=None)
df1 = df.iloc[:, 0:2]
plt.plot(df1[0], df1[1])
When I run the code I want the output to be displayed as follows (I mean I want all csv files from the folder to be displayed so that I can select the one I want):
0-Test_Summary_1.csv
1-Test_Summary_2.csv
2-Test_Summary_3.csv
3-Test_Summary_4.csv
4-Test_Summary_5.csv
5-Test_Summary_6.csv etc
The error I am getting is
FileNotFoundError:
even though the csv file is present in the folder.
If I understand your question correctly, then you could try something like this:
import os
import pandas as pd

# see this answer about absolute paths in windows
# https://stackoverflow.com/a/7767925/9225671
base_path = os.path.join('f:', os.sep, 'Users', 'Desktop', 'Data', 'Summary')

# Walk 'base_path' recursively and keep the FULL path of every CSV file
# found in it or any of its subfolders.
csv_file_list = [
    os.path.join(dir_path, file_name)
    for dir_path, _, file_name_list in os.walk(base_path)
    for file_name in file_name_list
    if file_name.endswith('.csv')
]

print('CSV files that were found:')
for i, file_path in enumerate(csv_file_list):
    print(' {:3d} {}'.format(i, file_path))

# Let the user pick one file by its number, then load it.
selected_i = int(input('Enter corresponding number of the file to plot: '))
selected_file_path = csv_file_list[selected_i]
print('selected_file_path:', selected_file_path)
df = pd.read_csv(selected_file_path, header=None)
...
Does this work for you?

Reading text files from subfolders and folders and creating a dataframe in pandas for each file text as one observation

I have the following architecture of the text files in the folders and subfolders.
I want to read them all and create a df. I am using this code, but it doesn't work well for me: the text read in is not what I expected, and the number of files read doesn't match my count.
# One dataframe per txt file (each file's contents land in a column),
# then transpose so every file becomes a single observation row.
l = [pd.read_csv(filename, header=None, encoding='iso-8859-1')
     for filename in glob.glob("2018_01_01/*.txt")]
main_df = pd.concat(l, axis=1)
main_df = main_df.T

# Repeat for the next two folders and merge each result into main_df.
for i in range(2):
    pattern = str(foldernames[i + 1]) + '/' + '*.txt'
    l = [pd.read_csv(filename, header=None, encoding='iso-8859-1',
                     quoting=csv.QUOTE_NONE)
         for filename in glob.glob(pattern)]
    df = pd.concat(l, axis=1)
    df = df.T
    main_df = pd.merge(main_df, df)
file
Assuming those directories contain txt files in which information have the same structure on all of them:
import os
import pandas as pd

# Collect the full text of every file in every sub-directory of `path`
# into one DataFrame, one file per row.
df = pd.DataFrame(columns=['observation'])
path = '/path/to/directory/of/directories/'
for directory in os.listdir(path):
    # os.listdir returns bare names; join with `path`, otherwise isdir()
    # and the nested listdir() are resolved against the current working
    # directory instead of the target folder.
    dir_path = os.path.join(path, directory)
    if os.path.isdir(dir_path):
        for filename in os.listdir(dir_path):
            with open(os.path.join(dir_path, filename)) as f:
                observation = f.read()
            current_df = pd.DataFrame({'observation': [observation]})
            # DataFrame.append was removed in pandas 2.x; concat is the
            # supported replacement.
            df = pd.concat([df, current_df], ignore_index=True)
Once all your files have been iterated, df should be the DataFrame containing all the information in your different txt files.
You can do that using a for loop. But before that, you need to give a sequenced name to all the files like 'fil_0' within 'fol_0', 'fil_1' within 'fol_1', 'fil_2' within 'fol_2' and so on. That would facilitate the use of a for loop:
import pandas as pd

dataframes = []
# Assumes the files are named sequentially: fol_0/fil_0.txt, fol_1/fil_1.txt, ...
for var in range(1000):
    name = "fol_" + str(var) + "/fil_" + str(var) + ".txt"
    # collect every file's dataframe if you need them all at once
    dataframes.append(pd.read_csv(name))
    # otherwise you can work with each file one by one
    df = pd.read_csv(name)

Categories

Resources