How to process csv to hdfs using apache nifi? - python

Hello guys, hope you are doing well!
I have some CSV files that I want to put into HDFS, and if a file already exists it should append its content to the existing content. I tried a script in Python but with no results:
import os
import pandas as pd
from os import path
import sys, json
import csv
from csv import writer, reader

data = json.load(sys.stdin)
technologies = ['KPI_2G_NPO', 'GPRS']
old_path = data["old.path"]
filename = data["filename"]
old_path = old_path.replace("C:\\Users\\12\\Desktop\\APACHE~1\\NIFI-1~1.1\\", "")
old_path = old_path.replace("/", "")
old_path_list = old_path.split('\\')

def append_list_as_row(file_name, list_of_elem):
    with open(file_name, 'a+', newline='') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(list_of_elem)

df = pd.read_csv(data["new.path"] + data["filename"])
columns = df.columns.values.tolist()
for tech in technologies:
    if (tech in filename and old_path_list[0] in filename):
        if path.exists("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv"):
            header_saved = True
            with open(data["new.path"] + data["filename"]) as file2:
                header = next(file2)
                header = next(file2)
                if header_saved:
                    for line in file2:
                        append_list_as_row("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv", list(line.split(",")))
            os.remove(data["new.path"] + data["filename"])
        else:
            df.to_csv("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv")
            os.remove(data["new.path"] + data["filename"])
And here's my NiFi pipeline picture.
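Note that os.path.exists() and the built-in open() cannot address hdfs:// URLs, which is likely why the script above produces no results. As a point of comparison only, here is a minimal sketch of the append logic using the third-party hdfs (WebHDFS) client; the namenode URL/port and the helper name append_csv_to_hdfs are assumptions, not part of the original flow:

from hdfs import InsecureClient

# Assumes WebHDFS is reachable on the quickstart VM; adjust host/port as needed.
client = InsecureClient('http://quickstart.cloudera:50070', user='cloudera')

def append_csv_to_hdfs(local_file, hdfs_path):
    """Append a local CSV to a file in HDFS, creating it (header included) if absent."""
    with open(local_file, encoding='utf-8') as f:
        header = f.readline()   # first line: CSV header
        body = f.read()         # remaining data rows
    if client.status(hdfs_path, strict=False) is None:
        # target does not exist yet: write header + data
        client.write(hdfs_path, data=header + body, encoding='utf-8')
    else:
        # target exists: append only the data rows (requires dfs.support.append)
        client.write(hdfs_path, data=body, encoding='utf-8', append=True)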

Related

Python - Read files from folder and Write CSV file in format

import glob
import os
import csv
from collections import OrderedDict

# Remove output file if it already exists. Resolves the append issue.
file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
if os.path.isfile(file_path):
    os.remove(file_path)

list_of_files = glob.glob('C:\\Users\\Desktop\\Cobol\\*.CBL')  # Input files in Folder
Fields = ['Program Name', 'LinesofCode']  # to be displayed in output CSV file

# opening output csv file to write (Fields)
file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
with open(file_path, 'a') as csvfile1:
    csvwriter = csv.writer(csvfile1)
    csvwriter.writerow(Fields)
csvfile1.close()

def process_files_loc(list_of_files):
    for fileName in list_of_files:
        with open(fileName) as i:
            count = sum(1 for line in i)
            my_dict = {i: count}  # input filename and its lines of code
            ordered_dict = OrderedDict()  # using OrderedDict
            print(ordered_dict)
            # creating ordered dict from dict
            ordered_dict = OrderedDict(my_dict)
            print(ordered_dict)
            # writing records of Program name and LinesofCode to output csv file
            file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
            with open(file_path, 'a') as csvfile2:
                csvwriter = csv.writer(csvfile2)
                csvwriter.writerows(ordered_dict)
            csvfile2.close()

process_files_loc(list_of_files)
Output in Terminal (Error):
PS C:\Users\Python-1> & C:/Users/AppData/Local/Programs/Python/Python310/python.exe c:/Users/Python-1/one.py
OrderedDict()
OrderedDict([(<_io.TextIOWrapper name='C:\\Users\\Desktop\\Cobol\\ABCDEFGH.CBL' mode='r' encoding='cp1252'>, 191)])
OrderedDict()
OrderedDict([(<_io.TextIOWrapper name='C:\\Users\\Desktop\\Cobol\\IJKLMNOP.CBL' mode='r' encoding='cp1252'>, 195)])
Actual output of file in Folder:
C:\Users\Desktop\Cobol\Outputs
Name Date Modified Type Size
LOC3X.csv 9/15/2022 time Comma Separated 1KB
Problem: the script executed, read the 2 CBL files in the folder, and created 1 CSV file in the output folder. The output CSV file should have:
Program Name LinesofCode
ABCDEFGH.CBL 191
IJKLMNOP.CBL 195
However, the actual lines in the output CSV file are:
Program Name LinesofCode
Try something like this (note that writerows() expects an iterable of rows; passing it the OrderedDict makes it iterate over the keys, which are the already-exhausted file objects, so no data rows end up in the CSV):
import glob
import csv
import os

def process_files_loc(files):
    res = []
    for file in files:
        with open(file) as f:
            line_count = len([line.strip("\n") for line in f if line != "\n"])
            res.append([os.path.basename(f.name), line_count])
    return res

if __name__ == '__main__':
    with open('C:\\Users\\Main\\Desktop\\test\\test.csv', 'w', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(['Program Name', 'LinesofCode'])
        csvwriter.writerows(process_files_loc(glob.glob('C:\\Users\\Main\\Desktop\\test\\*.PY')))
Result: (screenshot of the output CSV)
Regards,

Python iterate folder of csv and convert to json

I am an amateur at Python, but I have the task of converting a folder of CSV files to JSON files. I have this script working with a specified CSV file, but I have no idea how to make the script iterate through a folder of CSVs and convert all of them to JSON. The original script:
import csv
import json
import pandas as pd

file = '/users/krzysztofpaszta/CSVtoGD/build-a-bridge.csv'
json_file = '/users/krzysztofpaszta/CSVtoGD/build-a-bridge.json'

# Read the CSV file
def read_CSV(file, json_file):
    csv_rows = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        field = reader.fieldnames
        for row in reader:
            csv_rows.extend([{field[i]: row[field[i]] for i in range(len(field))}])
        convert_write_json(csv_rows, json_file)

# Convert CSV to JSON
def convert_write_json(data, json_file):
    with open(json_file, "w") as f:
        f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': ')))
        f.write(json.dumps(data))

read_CSV(file, json_file)
Can someone give me a hint?
You can use os functions, particularly os.listdir() to iterate over files in the directory, and safely generate new names with os.path.splitext():
import os

DIRECTORY = "/path/to/dir"

for f in os.listdir(os.fsencode(DIRECTORY)):
    fn = os.fsdecode(f)
    pre, ext = os.path.splitext(fn)
    if ext == ".csv":
        read_CSV(fn, pre + '.json')
A similar approach with pathlib would be:
from pathlib import Path

DIRECTORY = "/path/to/dir"
files = Path(DIRECTORY).glob('*.csv')   # to process files only in this dir
files = Path(DIRECTORY).rglob('*.csv')  # to process files in sub-directories recursively

for f in files:
    read_CSV(f, str(f.with_suffix('.json')))  # use .with_suffix() for safe name generation
You can list the csv files in a folder using pathlib:
from pathlib import Path
csv_files = Path().glob('*.csv')
Then loop over the files:
for csv_file in csv_files:
    csv_path = str(csv_file.absolute())
    json_path = csv_path.replace('.csv', '.json')
    read_CSV(csv_path, json_path)

How to set the path with glob when the fileName is used for the csv?

I'm looking for a solution for building the paths for glob and for pandas to_csv. Does anyone have a solution?
My code:
from glob import glob
import json
import pandas as pd
PathIn = 'c:\\Users\\***\\PycharmProjects\\Project\\In'
PathOut = 'c:\\Users\\***\\PycharmProjects\\Project\\Out'
for fileName in glob(PathIn + '*.json', recursive=True):
    with open(fileName, 'rb') as f:
        json_dict = json.load(f)
        print(json_dict)
    ...
    df.to_csv(PathOut + fileName + '.csv', sep=";")
It doesn't print my JSON files, so it isn't picking up any file from my In folder, and I don't get any CSV in my Output.
The key here is that you want to create the output file in the relevant user dir based on the input file. So you could instead get a list of the user dirs and iterate over each of them, setting the in and output dirs, then search for the JSON files and create the CSV in the corresponding dir. Something like:
import json
from glob import glob
import os.path as op

basepath = r'C:\Users\***\PycharmProjects'
_in = 'In'
_out = 'Out'
suffix = '\*.json'
output_suffix = '.csv'

for path in glob(basepath):
    in_dir = op.join(path, _in)
    out_dir = op.join(path, _out)
    for json_file in glob(in_dir + suffix, recursive=True):
        in_file_name = op.basename(json_file)
        out_file_name = in_file_name.split('.')[0] + output_suffix
        output_file = op.join(out_dir, out_file_name)
        with open(json_file) as jf:
            json_data = json.load(jf)
        print(json_data)
        ### do some stuff with the json
        with open(output_file, 'w') as of:
            of.write("some data or json stuff")
Just slightly modifying your code: I think you missed a \ when writing the path for searching in the input directory.
For the output directory you need to build your filename by replacing the extension .json with .csv. There are many ways to do that:
import os

for fileName in glob(PathIn + '\*.json', recursive=True):
    with open(fileName, 'rb') as f:
        json_dict = json.load(f)
        print(json_dict)
    # build the output name from the input base name, swapping .json for .csv
    out_file_name = os.path.splitext(os.path.basename(fileName))[0] + '.csv'
    out_file_dir = os.path.join(PathOut, out_file_name)
    # Here do something with your output file
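Another option, shown purely as a sketch reusing the PathIn/PathOut names from above, is pathlib's with_suffix(), which swaps the extension without any string slicing:

from pathlib import Path

for fileName in glob(PathIn + '\*.json', recursive=True):
    # e.g. the .json input becomes a .csv name inside PathOut
    out_file_path = Path(PathOut) / Path(fileName).with_suffix('.csv').name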

Python Progress Bar in nested loop

I'm translating some Linux log data to a CSV for data analytics. Some of the instructions take some time, so I thought I would put in a progress bar for each file that is being translated. However, when putting in a progress bar with either progressbar2 or tqdm, my pandas dataframes are null. There's no data at all. When I remove the progress bar, everything works as it should.
Here is my CSV translating function:
import pandas as pd
from dateutil import parser
from tqdm import trange
import os
import glob
import csv
import socket

def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        file_length = len(data_file.readlines())
        for i in trange(file_length, desc='loop', leave=False):  # Progress Bar via tqdm
            for new_line in data_file:
                new_line = line.strip().split(" ")
                date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
                date = parser.parse(date)
                ip = str(new_line[5]).partition("/")
                ip = str(ip[0]).strip()
                try:
                    url = str(new_line[7]).strip()
                except:
                    url = None
                csv_w.writerow([date, ip, url])
TQDM is breaking something or I am implementing it incorrectly.
EDIT 1:
I figured it out. I was exhausting the file during my readlines() call to get the length. This works:
from tqdm import tqdm  # the fixed version uses tqdm directly, not trange

def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    path, dirs, files = next(os.walk(LOGS_FILE_PATH))
    log_num = len(files)
    print(log_num)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        with open(filename, "r") as f:
            file_length = len(f.readlines())
        f.close()
        pbar = tqdm(total=file_length)
        for line in data_file:
            new_line = line.strip().split(" ")
            date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
            date = parser.parse(date)
            ip = str(new_line[5]).partition("/")
            ip = str(ip[0]).strip()
            try:
                url = str(new_line[7]).strip()
            except:
                url = None
            csv_w.writerow([date, ip, url])
            pbar.update(1)
        pbar.close()
You can apply tqdm to your main loop:
from tqdm import tqdm
for i in tqdm(condition):
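For the log-translation loop specifically, here is a minimal sketch of that idea (the count_lines and translate_log names are illustrative, not from the question) that avoids exhausting the file handle used for parsing:

from tqdm import tqdm
import csv

def count_lines(path):
    # count lines in a separate pass so the handle used for parsing stays fresh
    with open(path, "r") as f:
        return sum(1 for _ in f)

def translate_log(path, csv_w):
    total = count_lines(path)
    with open(path, "r") as data_file:
        # wrap the iterator itself; total= lets tqdm show a real percentage
        for line in tqdm(data_file, total=total, desc=path, leave=False):
            csv_w.writerow(line.strip().split(" "))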

Read CSV from within Zip File

I have a directory of zip files (approximately 10,000 small files); within each is a CSV file I am trying to read and split into a number of different CSV files.
I managed to write the code to split the CSV files from a directory of CSVs, shown below, that reads the first attribute of the CSV and, depending what it is, writes it to the relevant CSV.
import csv
import os
import sys
import re
import glob
reader = csv.reader(open("C:/Projects/test.csv", "rb"), delimiter=',', quotechar='"')
write10 = csv.writer(open('ouput10.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
write15 = csv.writer(open('ouput15.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
headings10=["RECORD_IDENTIFIER","CUSTODIAN_NAME","LOCAL_CUSTODIAN_NAME","PROCESS_DATE","VOLUME_NUMBER","ENTRY_DATE","TIME_STAMP","VERSION","FILE_TYPE"]
write10.writerow(headings10)
headings15=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","USRN","STREET_DESCRIPTION","LOCALITY_NAME","TOWN_NAME","ADMINSTRATIVE_AREA","LANGUAGE"]
write15.writerow(headings15)
for row in reader:
    type = row[0]
    if "10" in type:
        write10.writerow(row)
    elif "15" in type:
        write15.writerow(row)
So I am now trying to read the Zip files rather than wasting time extracting them first.
This is what I have so far, after following as many tutorials as I could find:
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv = '.'.join([dataFile, 'csv'])
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv))
    reader = csv.reader(data)
    for row in reader:
        print row
However, an error gets thrown:
AttributeError: 'str' object has no attribute 'reader'
Hopefully someone can show me how to change my working CSV-reading code so that it reads from the Zip files.
Much appreciated
Tim
Simple fix. You're shadowing the csv module with your local csv variable. Just change the name of that variable:
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv_file = '.'.join([dataFile, 'csv'])  # all fixed
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv_file))  # don't forget this line!
    reader = csv.reader(data)
    for row in reader:
        print row
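For reference, that answer targets Python 2 (the StringIO module and the print statement). A rough Python 3 sketch of the same idea, using io.TextIOWrapper over zipfile, and assuming each archive holds exactly one CSV (hence the namelist()[0] shortcut), might look like:

import csv
import glob
import io
import zipfile

for name in glob.glob('C:/Projects/abase/*.zip'):
    with zipfile.ZipFile(name) as zfile:
        csv_name = zfile.namelist()[0]  # assumes a single CSV per archive
        with zfile.open(csv_name) as raw:
            reader = csv.reader(io.TextIOWrapper(raw, newline=''))
            for row in reader:
                print(row)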
