Merge all CSVs in a folder by columns without pandas - Python

I'm using Python 3 and I want to merge a few CSV files by columns.
Is it possible to do this without pandas?
For example, suppose I have these two CSVs:
df1:
Name Surname PCName
Max Petrov wrs123
Ivan Ivanov wrs321
df2:
Surname Name PCName
Sidorov Vasily wrs223
Dmitriev Alex wrs331
With pandas I've got this solution:
import os
import pandas as pd  # $ pip install pandas
import time

def cls():
    os.system('cls' if os.name == 'nt' else 'clear')

cls()
today = time.strftime("%y%m%d")
fldpath = 'C:/tmp2/test/'
filepath = fldpath + today + "_merged.csv"
print(os.listdir(fldpath))
print("type beginning of file names")
fmask = input()
file_list = [fldpath + f for f in os.listdir(fldpath) if f.startswith(fmask)]
csv_list = []
for file in sorted(file_list):
    csv_list.append(pd.read_csv(file).assign(File_Name=os.path.basename(file)))
csv_merged = pd.concat(csv_list, ignore_index=True)
csv_merged.to_csv(filepath, index=False)

You could use Python's csv.DictReader() and csv.DictWriter() to do this as follows:
import csv
import os
import time

def cls():
    os.system('cls' if os.name == 'nt' else 'clear')

cls()
today = time.strftime("%y%m%d")
fldpath = 'C:/tmp2/test/'
filepath = fldpath + today + "_merged.csv"
print(os.listdir(fldpath))
print("type beginning of file names")
fmask = input()
file_list = [fldpath + f for f in os.listdir(fldpath) if f.startswith(fmask)]

with open(filepath, 'w', newline='') as f_output:
    csv_output = csv.DictWriter(f_output, fieldnames=["Name", "Surname", "PCName"])
    csv_output.writeheader()
    for file in sorted(file_list):
        with open(file) as f_input:
            csv_input = csv.DictReader(f_input)
            csv_output.writerows(csv_input)
For your given example, this would produce an output of:
Name,Surname,PCName
Max,Petrov,wrs123
Ivan,Ivanov,wrs321
Vasily,Sidorov,wrs223
Alex,Dmitriev,wrs331
This assumes each CSV file has the same field names (the order is not important); a sketch for files with differing columns follows below.
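If the files might not all share exactly the same columns, one option is a two-pass variant: first collect the union of all header fields, then let DictWriter fill any gaps via restval. A minimal sketch under that assumption (fldpath and the hypothetical output name merged.csv are carried over from above):
import csv
import os

fldpath = 'C:/tmp2/test/'  # same folder as above (assumption)
file_list = [fldpath + f for f in sorted(os.listdir(fldpath))
             if f.endswith('.csv') and f != 'merged.csv']

# First pass: collect the union of all header fields, keeping first-seen order
fieldnames = []
for file in file_list:
    with open(file, newline='') as f_input:
        for name in csv.DictReader(f_input).fieldnames or []:
            if name not in fieldnames:
                fieldnames.append(name)

# Second pass: write every row; restval='' leaves missing fields empty
with open(fldpath + 'merged.csv', 'w', newline='') as f_output:
    csv_output = csv.DictWriter(f_output, fieldnames=fieldnames, restval='')
    csv_output.writeheader()
    for file in file_list:
        with open(file, newline='') as f_input:
            csv_output.writerows(csv.DictReader(f_input))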

Related

The csv writer is writing some unrealistic values to the CSV in Python

In my code, the csv writer is writing some unrealistic values to the CSV file.
My goal is to read all the CSV files in one directory, apply a filter on a specific column, and write the filtered dataframes to a consolidated CSV file.
I can see the expected output in the VS Code console, but I am not able to write it to a CSV file.
Kindly help me understand what I am doing incorrectly.
This is my sample input:
And this is the output I am getting:
Code:
import pandas as pd
import os
import glob
import csv
from pandas.errors import EmptyDataError

# use glob to get all the csv files in the folder
path = os.getcwd()
#print(path)
csv_files = glob.glob(os.path.join(path, "*.csv"))
print(csv_files)

col_name = input("Enter the column name to filter: ")
print(col_name)
State_Input = input("Enter the {} ".format(col_name))
print(State_Input)

df_empty = pd.DataFrame()
for i in csv_files:
    try:
        df = pd.read_csv(i)
        #print(df.head(5))
        State_Filter = df["State"] == State_Input
        print(df[State_Filter])
        df_child = (df[State_Filter])
        with open('D:\\PythonProjects\\File-Split-Script\\temp\\output\\csv_fil111.csv', 'w') as csvfile:
            data_writer = csv.writer(csvfile, dialect='excel')
            for row in df_child:
                data_writer.writerows(row)
    except EmptyDataError as e:
        print('There was an error in your input, please try again :{0}'.format(e))
Use DataFrame.to_csv to write your file in one go. Prefer storing your filtered dataframes in a list, then concatenate all of them into a new dataframe:
import pandas as pd
import pathlib

data_dir = pathlib.Path.cwd()

# Your input here
state = input('Enter the state: ')  # Gujarat, Bihar, ...
print(state)

data = []
for csvfile in data_dir.glob('*.csv'):
    df = pd.read_csv(csvfile)
    df = df.loc[df['State'] == state]
    data.append(df)

df = pd.concat(data, ignore_index=True)
df.to_csv('output.csv', index=False)

How to change value in multiple CSV cells?

I have 118 CSVs, and I need to go into each CSV and change F1, F2, F3 and so on to 0.
For example, in csv1 F1 = 0, in csv2 F2 = 0, in csv3 F3 = 0, and so on.
The CSV has headers:
I am assuming all of your CSV files have the same format, and that you are trying to set column F to be 0 for all of them.
You can use Python CSV library to help you as follows:
import csv
import glob

for filename in glob.glob('*.csv'):
    print(f'Processing: {filename}')
    with open(filename) as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)
        rows = [[*row[:5], '0'] for row in csv_input]
    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(rows)
This reads all the .csv files in a given folder and changes the Multi Col 2 values to 0. It does this for every row but leaves the header unchanged.
Thank you all. I made my own solution; it is a lot less classy than the ones posted here, but I automated everything from needing x number of files through to amending the column/row.
#==============================================================================
# Import the necessary packages
import os
#import glob
import shutil
import pathlib
import pandas as pd
#import numpy as np
#==============================================================================
InputPath = 'F:\\cells\\bc_dbase\\bc_dbase1.csv'
OutputPath = 'F:\\cells\\bc_dbase'
str1 = 'Name '
str2 = 'Mult Col 2'
NoStart = 1
NoEnd = 119
#==============================================================================
# Create complete path of folders
def CreatePath(FullPath, File=False):
    Parts = pathlib.Path(FullPath).parts
    for [n1, Folder] in enumerate(Parts):
        if File==True and n1==len(Parts)-1 and "." in Parts[n1]:
            continue
        elif n1==0:
            FolderPath = Parts[n1]
        else:
            FolderPath = os.path.join(FolderPath, Folder)
        if os.path.exists(FolderPath)==False:
            os.mkdir(FolderPath)
#==============================================================================
# Delete folder
def DeleteFolder(FullPath):
    FullPath = pathlib.Path(FullPath)
    try:
        shutil.rmtree(FullPath)
    except:
        pass
#==============================================================================
CreatePath(OutputPath, File=False)
[FolderPath, File] = os.path.split(InputPath)
[FileName, FileExt] = os.path.splitext(os.path.basename(InputPath))
ReversedFileName = FileName[::-1]
AdjFileName = FileName
for n1 in reversed(range(len(AdjFileName))):
    char = FileName[n1]
    if char.isdigit():
        AdjFileName = AdjFileName[:n1] + AdjFileName[(n1+1):]
    else:
        break
Data1 = pd.read_csv(InputPath)
Data2 = pd.DataFrame.copy(Data1)
NameCols = Data1.columns
if str2 in NameCols:
    Data2.loc[:, str2] = 1
for n1 in range(NoStart, NoEnd+1):
    NewFile = AdjFileName + str(n1) + FileExt
    NewFilePath = os.path.join(OutputPath, NewFile)
    Data3 = pd.DataFrame.copy(Data2)
    index = Data3[Data3[str1]==n1].index[0]
    Data3.loc[index, str2] = 0
    Data3.to_csv(NewFilePath, index=False)
    print('[INFO] Storing file:', NewFilePath)
#==============================================================================
Mr. Evans has pretty neat code using the Python csv library, so I will expand on it a bit to answer your specific question.
import csv
import glob

file_count = 0
for filename in glob.glob('*.csv'):
    file_count += 1
    print(f'Processing: {filename}')
    with open(filename) as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)
        line_count = 0
        rows = []
        for row in csv_input:
            line_count += 1
            if line_count == file_count:
                rows.append([*row[:5], '0'])
            else:
                rows.append([*row[:6]])
    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(rows)
Note: the code will run for all the .csv files in the working directory, in whatever order glob returns them; see the sorted variant sketched below.
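glob.glob() does not actually guarantee alphabetical order (the order depends on the file system), so if the csv1 -> F1, csv2 -> F2 mapping matters, it is safer to sort explicitly. A hedged variant of the loop above, assuming six data columns as before:
import csv
import glob

# sorted() makes the csv1 -> F1, csv2 -> F2, ... mapping deterministic,
# since glob.glob() alone does not guarantee any particular order
for file_count, filename in enumerate(sorted(glob.glob('*.csv')), start=1):
    print(f'Processing: {filename}')
    with open(filename) as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)
        rows = [[*row[:5], '0'] if line_count == file_count else row[:6]
                for line_count, row in enumerate(csv_input, start=1)]
    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(rows)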

How process csv to hdfs using apache nifi?

Hello guys, hope you're doing well!
I have some CSV files that I want to put into HDFS; if a file already exists, it should append its content to the existing content. I tried a script in Python, but with no results:
import os
import pandas as pd
from os import path
import sys, json
import csv
from csv import writer, reader

data = json.load(sys.stdin)
technologies = ['KPI_2G_NPO', 'GPRS']
old_path = data["old.path"]
filename = data["filename"]
old_path = old_path.replace("C:\\Users\\12\\Desktop\\APACHE~1\\NIFI-1~1.1\\", "")
old_path = old_path.replace("/", "")
old_path_list = old_path.split('\\')

def append_list_as_row(file_name, list_of_elem):
    with open(file_name, 'a+', newline='') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(list_of_elem)

df = pd.read_csv(data["new.path"] + data["filename"])
columns = df.columns.values.tolist()
for tech in technologies:
    if (tech in filename and old_path_list[0] in filename):
        if path.exists("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv"):
            header_saved = True
            with open(data["new.path"] + data["filename"]) as file2:
                header = next(file2)
                header = next(file2)
                if header_saved:
                    for line in file2:
                        append_list_as_row("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv", list(line.split(",")))
            os.remove(data["new.path"] + data["filename"])
        else:
            df.to_csv("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv")
            os.remove(data["new.path"] + data["filename"])
And here is my NiFi pipeline picture:
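One likely reason the script shows no results: the built-in open() and os.path.exists() only understand local filesystem paths, so strings starting with hdfs:// never actually reach HDFS. A minimal sketch of the append-or-create step using the third-party hdfs package (HdfsCLI, which talks to WebHDFS); the endpoint, user, and paths here are assumptions based on the question:
# pip install hdfs  (HdfsCLI; assumes WebHDFS is enabled on the cluster)
from hdfs import InsecureClient

# WebHDFS HTTP endpoint, not the hdfs:// RPC port; host/port are assumptions
client = InsecureClient('http://quickstart.cloudera:50070', user='cloudera')

def upload_or_append(local_file, hdfs_path):
    # Create the HDFS file from local_file, or append its data rows if it exists
    with open(local_file, newline='') as f:
        header = next(f)   # first line: column names
        body = f.read()    # remaining lines: data rows
    if client.status(hdfs_path, strict=False) is None:
        # File does not exist yet: write header + data
        client.write(hdfs_path, data=header + body, encoding='utf-8')
    else:
        # File already exists: append only the data rows, skipping the header
        client.write(hdfs_path, data=body, encoding='utf-8', append=True)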

how to convert folder of pickle files into single csv file

I have a directory containing about 1700 pickle files, where every file holds all the Twitter posts of one user. I want to convert it into a folder of CSV files, where every CSV file name is the name of the pickle file and each row contains one tweet of the user...
After that, I want to keep just the top 20 CSVs with more samples than the others... how can I do that?
# khabarlist = open_file_linebyline(pkl_path)
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        # if len(file.name.split()) > 1:
        #     continue
        # if file.split('.')[-1] != "pkl":
        with open(file, 'r', encoding='utf8') as f:
            items = [i.strip() for i in f.read().split(",")]
            my_dict[file.replace(".pkl", "")] = items
            df = pd.DataFrame(my_dict)
            df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")
I wrote sample code for it, but it did not work...
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        if len(file.name.split()) > 1:
            continue
        if file.split('.')[-1] != "pkl":
            with open(file, 'r', encoding='utf-8', errors='replace') as f:
                print(f.readlines())
                items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
                my_dict[file.replace(".pkl", "")] = items
                df = pd.DataFrame(my_dict)
                df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")
And a better answer:
import os
import pandas as pd
import regex as re

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
for path in os.listdir(data_path):
    my_tweets = []
    df = pd.read_pickle(data_path + path)
    for tweet in df.tweet:
        url = re.findall(r"http\S+", tweet)
        if url == []:
            my_tweets.append(tweet)
    new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
    new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a")
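For the second part of the question (keeping only the top 20 users by number of tweets), a minimal sketch that counts rows per pickle and then writes one CSV per top-20 user; data_path, the tweet column, and the out_dir output folder are assumptions carried over from the answer above:
import os
import pandas as pd

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
out_dir = "/content/drive/My Drive/twint/csv/"  # hypothetical output folder
os.makedirs(out_dir, exist_ok=True)

# Count tweets per pickle, then keep the 20 largest
counts = {name: len(pd.read_pickle(data_path + name))
          for name in os.listdir(data_path) if name.endswith(".pkl")}
top20 = sorted(counts, key=counts.get, reverse=True)[:20]

# Write one CSV per top-20 user, one tweet per row
for name in top20:
    df = pd.read_pickle(data_path + name)
    df[["tweet"]].to_csv(out_dir + name.replace(".pkl", ".csv"), index=False)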

Transposing all csv files within a folder

I got help the last time I asked a question on this site about batch processing CSV files within a folder using glob.glob() in Python. This time I am trying to use it to transpose all the CSV files within a folder. The script below only processes the last file and stops. What am I doing wrong?
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'
with open(os.path.join(output, filename), 'wb') as output_file:
    writer = csv.writer(output_file)
    for i in range(len(max(cols, key=len))):
        writer.writerow([(c[i] if i < len(c) else '') for c in cols])
You need to indent the "output" portion of the code so that it runs once for each iteration of the for in_file loop:
import csv
import os
import glob

directory = raw_input("INPUT Folder")
output = raw_input("OUTPUT Folder:")
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

    # "Outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'wb') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
In your version that code only runs once, after the for in_file loop has completed, and therefore only outputs cols data left over from the final iteration of that loop.
I have also "outdented" the filename = ... statement to the for in_file level, as this only needs to be done once for each in_file, not once for each row of each in_file.
You can get a lot of mileage out of pandas for data manipulation:
import os
import pandas as pd

for filename in os.listdir('.'):
    # We save an augmented filename later, so using splitext is useful
    # for more than just checking the extension.
    prefix, ext = os.path.splitext(filename)
    if ext.lower() != '.csv':
        continue

    # Load the data into a dataframe
    df = pd.read_csv(filename, header=None)

    # Transpose is easy, but you could do TONS of data
    # processing here. pandas is awesome.
    df_transposed = df.T

    # Save to a new file with an augmented name
    df_transposed.to_csv(prefix + '_T' + ext, header=True, index=False)
The os.walk version is not much different, if you need to dig into subfolders as well.
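For completeness, a minimal os.walk() sketch of the same transpose-and-save idea for nested folders; the _T suffix convention is carried over from above:
import os
import pandas as pd

for dirpath, dirnames, filenames in os.walk('.'):
    for filename in filenames:
        prefix, ext = os.path.splitext(filename)
        if ext.lower() != '.csv' or prefix.endswith('_T'):
            continue  # skip non-CSV files and already-transposed output
        full_path = os.path.join(dirpath, filename)
        df = pd.read_csv(full_path, header=None)
        # Write the transposed copy next to the original file
        df.T.to_csv(os.path.join(dirpath, prefix + '_T' + ext),
                    header=True, index=False)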
Here is a working one:
I had to google for an hour, but it works, tested on Python 3.3.
import csv
import os
import glob

directory = r'C:\Python33\csv'
output = r'C:\Python33\csv2'
in_files = os.path.join(directory, '*.csv')

for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        cols = []
        for row in reader:
            cols.append(row)

    # "Outdent" this code so it only needs to run once for each in_file
    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

    # Indent this to the same level as the rest of the "for in_file" loop!
    with open(os.path.join(output, filename), 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        for i in range(len(max(cols, key=len))):
            writer.writerow([(c[i] if i < len(c) else '') for c in cols])
in_files will only return a single result in that format. Try returning a list instead (os.path.join keeps the directory prefix on each name):
in_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
