I want to create a dataset.csv file from the various raw files within my input_path. However, my code didn't seem to generate the csv file.
import os
import re

import numpy as np
import pandas as pd

input_path = "../input_data/"
class RawToCSV:
    def __init__(self, path_, df=None):
        self.df = df
        self.measurement_df = None
        self.cls = None
        self.path_ = path_

    def raw_file_processing(self, path_):
        # Walk all the subfolders within path_
        for root, dirs, files in os.walk(path_):
            for file in files:
                df = pd.read_csv(os.path.join(root, file))
                # Create the ID series by concatenating columns 1-3
                df = df.assign(ID=df[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                    lambda row: '_'.join(str(each) for each in row), axis=1))
                # The class info is tile_num (i.e., column 3) - TODO: check if class info is tile_num
                cls_col = df['tile_num:tile_num']
                df = df.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
                # Dummy-code the classes
                self.cls = pd.get_dummies(cls_col)
                # Obtain the measurement info: normalize the data against the
                # blank/empty columns, then log-transform it
                measurement_cols = [c for c in df.columns[9:] if c != "ID"]
                background_cols = [c for c in measurement_cols if re.search(r"(Blank|Empty)$", c)]
                data_cols = [c for c in measurement_cols if c not in background_cols]
                background = df[background_cols].mean(axis=1)
                self.measurement_df = np.log2(df[data_cols].div(background, axis=0))
                self.df = df
        return self.df["ID"], self.cls, self.measurement_df

    def dataset_csv(self):
        """Col 1: ID
        Col 2: class
        Col 3-n: measurements"""
        id_col = self.df["ID"].to_frame()
        # pd.get_dummies already returns a DataFrame, so no to_frame() needed
        frames = [id_col, self.cls, self.measurement_df]
        # Concatenate column-wise; the default axis=0 would stack the
        # frames on top of each other instead of side by side
        dataset_df = pd.concat(frames, axis=1)
        dataset_df.to_csv("../input_data/dataset.csv", index=False)
        return dataset_df

# The class has to be instantiated and the raw files processed before
# dataset_csv() has anything to write
converter = RawToCSV(input_path)
converter.raw_file_processing(input_path)
dataset = converter.dataset_csv()
Desired output dataset.csv:

ID       Class  measurement_A  measurement_B
Sample1  value  value          value
Sample2  value  value          value
Not sure if you need to treat your path as a raw string. You should simply try:
data_csv = dataset_df.to_csv("../input_data/dataset.csv")
PS: as a test, start by saving your csv in the same folder where your script is:
data_csv = dataset_df.to_csv("dataset.csv")
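If the file still does not show up, check which directory relative paths are resolved against; it is the process's current working directory, not necessarily the folder the script lives in:
import os
print(os.getcwd())  # relative paths such as "../input_data/" are resolved from here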
Related
In my code, the csv-writer is writing some unrealistic values to the CSV file.
My goal is to read all csv files in one directory, apply a filter on a specific column, and write the filtered dataframe to a consolidated csv file.
I am able to get the outputs as required in the VS console, but I am not able to write them into a csv file.
Kindly help me understand what I am doing incorrectly.
This is my sample input:
And this is the output I am getting:
Code:
import pandas as pd
import os
import glob
import csv
from pandas.errors import EmptyDataError

# use glob to get all the csv files in the folder
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "*.csv"))
print(csv_files)

col_name = input("Enter the column name to filter: ")
print(col_name)
State_Input = input("Enter the {} ".format(col_name))
print(State_Input)

df_empty = pd.DataFrame()

for i in csv_files:
    try:
        df = pd.read_csv(i)
        State_Filter = df["State"] == State_Input
        print(df[State_Filter])
        df_child = df[State_Filter]
        # Note: iterating a DataFrame yields its column names, and
        # writerows() on a single string writes one character per row.
        # The file is also reopened in 'w' mode for every input csv,
        # so only the last file's output survives.
        with open('D:\\PythonProjects\\File-Split-Script\\temp\\output\\csv_fil111.csv', 'w') as csvfile:
            data_writer = csv.writer(csvfile, dialect='excel')
            for row in df_child:
                data_writer.writerows(row)
    except EmptyDataError as e:
        print('There was an error in your input, please try again: {0}'.format(e))
Use pd.DataFrame.to_csv to write your file in one go. Prefer storing your filtered dataframes in a list, then concatenate them all into a new dataframe:
import pandas as pd
import pathlib

data_dir = pathlib.Path.cwd()

# Your input here
state = input('Enter the state: ')  # Gujarat, Bihar, ...
print(state)

data = []
for csvfile in data_dir.glob('*.csv'):
    df = pd.read_csv(csvfile)
    df = df.loc[df['State'] == state]
    data.append(df)

# Stack the filtered frames row-wise and write them out once
df = pd.concat(data, axis=0, ignore_index=True)
df.to_csv('output.csv', index=False)
I need a little help appending the data that is generated by the for loop below. Currently, I am writing it to a dataframe in the line df = pd.DataFrame(li_row, columns=col_names).
But when I have multiple files whose names start with PAJ, I need the resulting dataframes appended into one dataframe.
Also, the code below is bits and pieces we gathered and amended to suit our needs. Please excuse me in case you feel it is a mess. :)
import xmlschema
import os
import xml.etree.ElementTree as ET
import pandas as pd

dirpath = "C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls"
filenames = os.listdir(dirpath)

for eachfile in filenames:
    fname = eachfile[0:3]
    print(dirpath + '\\' + eachfile)
    if fname == 'PAJ':
        xmlschema.validate(dirpath + '\\' + eachfile, 'PAJ.xsd')
        # Parse with the full path, not just the bare file name
        tree = ET.parse(dirpath + '\\' + eachfile)
        root = tree.getroot()
        # Get alertId from the header
        cols = {}
        for header in root.findall(".//header/alertId"):
            cols[header.tag] = header.text
        # Get detailHdr cells to use as column header names
        col_names = []
        for DtHeader in root.findall(".//detailHdr/c"):
            col_names.append(DtHeader.text)
        # Get each row and its cells
        li_row = []
        size = 0
        for Data in root.findall(".//report/data"):
            for child in Data:
                li_row.append([])
                for grandchild in child:
                    li_row[size].append(grandchild.text)
                size += 1
        # Create a dataframe with col_names as columns and alertId added at the end
        df = pd.DataFrame(li_row, columns=col_names)
        df['alertId'] = cols['alertId']
        print(df)
    elif fname == 'PIE':
        fileContent = ''
        with open(dirpath + '\\' + eachfile) as filehandle:
            fileContent = filehandle.read()
        modFileContent = fileContent.replace("UTF-16", "UTF-8")
        xmlschema.validate(modFileContent, 'PIE.xsd')
So if I were to change your current solution as little as possible, I would create a list of paj_data_frames and concatenate them once the script is done. See the pd.concat documentation: https://pandas.pydata.org/docs/user_guide/merging.html
paj_data_frames = []
for eachfile in filenames:
    ....
    if fname == 'PAJ':
        df = pd.DataFrame(li_row, columns=col_names)
        df['alertId'] = cols['alertId']
        paj_data_frames.append(df)
    ....
final_df = pd.concat(paj_data_frames)
I have a python script for generating 1 upload file from 1 input file.
The thing is that the input files have started coming in batches, 30-50 at a time.
e.g.:
1111.xlsx --> upload.xlsx
1125.xlsx --> upload.xlsx
1176.xlsx --> upload.xlsx
1322.xlsx --> upload.xlsx
The code just converts the input files into the upload format.
Here's what I have done so far (1 input file -> 1 output file):
def main():
    initial_workbook = 'C:/files/1111.xlsx'
    initial_df = pd.read_excel(initial_workbook, sheet_name="default")
    # promote row 2 to the header and drop everything above it
    new_header = initial_df.iloc[2]
    initial_df = initial_df.iloc[3:]
    initial_df.columns = new_header
    # drop all rows with no data
    indexNames = initial_df[initial_df['grade'] == 'select'].index
    initial_df.drop(indexNames, inplace=True)
    # dropna returns a new frame, so keep the result
    initial_df = initial_df.dropna(axis=1, how='all')
    initial_df.to_excel('C:/files/upload_file.xlsx', index=False)
Is there a way to generate one upload file for all the files in the input folder? And once the input files have been processed, rename them by prefixing an x, e.g. x1111.xlsx.
So here is how I would approach it for a given batch:
from datetime import datetime
import os
from pathlib import Path

import pandas as pd

proj_path = Path("C:/files/")

# define main() before the loop that calls it
def main(f):
    initial_workbook = proj_path / f
    initial_df = pd.read_excel(initial_workbook, sheet_name="default")
    # promote row 2 to the header and drop everything above it
    new_header = initial_df.iloc[2]
    initial_df = initial_df.iloc[3:]
    initial_df.columns = new_header
    # drop all rows with no data
    indexNames = initial_df[initial_df['grade'] == 'select'].index
    initial_df.drop(indexNames, inplace=True)
    initial_df = initial_df.dropna(axis=1, how='all')
    return initial_df

all_dfs = []
for f in os.listdir(proj_path):
    if f.endswith(".xlsx"):
        print(f"processing {f}...")
        df_tmp = main(f)
        df_tmp["file_name"] = f
        all_dfs.append(df_tmp)

df_all = pd.concat(all_dfs, axis=0)
# strftime-style formatting avoids characters such as ':' that are
# invalid in Windows file names
df_all.to_excel(proj_path / f"{datetime.now():%Y-%m-%d_%H%M%S}_batch.xlsx", index=False)
You can potentially enclose the logic for a batch in a function.
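And for the renaming part of the question, a minimal sketch, assuming the x prefix is only meant to mark inputs as processed (proj_path and the _batch suffix are taken from the code above):
import os
from pathlib import Path

proj_path = Path("C:/files/")

for f in os.listdir(proj_path):
    # skip the generated batch file and anything already marked processed
    if f.endswith(".xlsx") and not f.startswith("x") and not f.endswith("_batch.xlsx"):
        os.rename(proj_path / f, proj_path / ("x" + f))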
I have a datafile which is the result of combining several sources that contain name information. Each name has a unique ID (column ID).
Sorting by the ID column, I would like to remove the second/third source findings in the Source column.
My output today:
(all the red rows are "duplicates" since we already got them from the first source (blue rows))
What I would like to achieve:
How can I achieve this result?
Is there a way to iterate row by row, removing duplicate IDs already while iterating in the "for file in files:" part of the code?
Or is it easier to do it on df_merged before I write the dataframe out to an excel file?
Code:
import pandas as pd
import os
from datetime import datetime
from shutil import copyfile
from functools import reduce
import numpy as np

# Path
base_path = "G:/Till/"

# Def
def get_files(folder, filetype):
    list_files = []
    directory = os.fsencode(folder)
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith("." + filetype.strip().lower()):
            list_files.append(filename)
    return list_files

# export files
df_result_e = pd.DataFrame()
files = get_files(base_path + "datasource/" + "export", "xlsx")
df_append_e = pd.DataFrame()
for file in files:
    # read_excel has no index argument, so don't pass index=False here
    df_temp = pd.read_excel(base_path + "datasource/export/" + file, "Results", dtype=str)
    df_temp["Source"] = file
    df_append_e = pd.concat([df_append_e, df_temp])
df_result_e = pd.concat([df_result_e, df_append_e])
print(df_result_e)

# match files
df_result_m = pd.DataFrame()
files = get_files(base_path + "datasource/" + "match", "xlsx")
df_append_m = pd.DataFrame()
for file in files:
    df_temp = pd.read_excel(base_path + "datasource/match/" + file, "Page 1", dtype=str)
    df_append_m = pd.concat([df_append_m, df_temp])
df_result_m = pd.concat([df_result_m, df_append_m])
df_result_m = df_result_m[['ID_Our', 'Name_Our', 'Ext ID']]
df_result_m.rename(columns={'ID_Our': 'ID', 'Name_Our': 'Name', 'Ext ID': 'Match ID'}, inplace=True)
df_result_m.dropna(subset=["Match ID"], inplace=True)  # Drop all NA

data_frames = [df_result_e, df_result_m]

# Join files
df_merged = reduce(lambda left, right: pd.merge(left, right, on=["Match ID"], how='outer'), data_frames)

# Output of files
df_merged.to_excel(base_path + "Total datasource Export/" + datetime.now().strftime("%Y-%m-%d_%H%M") + ".xlsx", index=False)
To remove them, you can try transform with factorize:
newdf = df[df.groupby('ID')['Source'].transform(lambda x: x.factorize()[0]) == 0]
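A minimal sketch of what the transform does, with made-up IDs and sources: within each ID group, factorize numbers the sources in order of first appearance, so keeping the rows where that code is 0 keeps only each ID's first source.
import pandas as pd

df = pd.DataFrame({
    'ID': [1, 1, 1, 2, 2],
    'Source': ['export_a.xlsx', 'export_a.xlsx', 'match_b.xlsx',
               'export_a.xlsx', 'match_b.xlsx'],
})
# factorize codes each ID's first source as 0, its second source as 1, ...
codes = df.groupby('ID')['Source'].transform(lambda x: x.factorize()[0])
newdf = df[codes == 0]  # rows 0, 1 and 3 survive: the first source per ID
print(newdf)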
I am trying to run through a set of CSV files in order to compile a results CSV file. I'm getting an error that my function is undefined for some reason. Can you tell me why? Thanks.
import os

import numpy as np
import pandas as pd

def response_amp(data):
    # read in the data and cut out the unnecessary parts
    df = pd.read_csv(data, encoding='utf-8')  # read the file that was passed in, not a hardcoded 'data.csv'
    df = df[:-1]
    a = df.columns[df.columns.str.startswith('µ')]
    df = df[a]
    dfd = df.drop(df.index[:30])  # dropping the section with no sample
    # splitting the data into chunks so response values can be measured
    df1d = dfd[:320]     # first interval
    df2d = dfd[330:470]  # second interval
    df3d = dfd[480:]     # third interval
    # rolling avg on each
    df1r = df1d.rolling(5, win_type='gaussian').sum(std=4)
    df2r = df2d.rolling(5, win_type='gaussian').sum(std=4)
    df3r = df3d.rolling(5, win_type='gaussian').sum(std=4)
    bsln_1 = df1r.iloc[3:6].mean()
    bsln_2 = df2r.iloc[3:6].mean()
    bsln_3 = df3r.iloc[3:6].mean()
    # use the matching interval for each response (the original used df1r for all three)
    response_1 = abs(df1r.min() - bsln_1) / bsln_1
    response_2 = abs(df2r.min() - bsln_2) / bsln_2
    response_3 = abs(df3r.min() - bsln_3) / bsln_3
    return response_1, response_2, response_3

directory = r'file directory goes here'
response = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        # pass the full path; note that response_amp must be defined
        # before this loop runs, otherwise the name is undefined
        response.append(response_amp(os.path.join(directory, filename)))
a = np.asarray(response)  # the module was imported as np, so use np, not numpy
np.savetxt("ks_response.csv", a, delimiter=",")
Thanks for the help.