Processing multiple files and writing a CSV file for each - Python

I wrote code that works fine for a single file, but I have to change the names for each file. It reads a pickle file, writes it into a txt file, then does some processing on the contents of the txt file and produces a list of numbers; at the end it stores the list in a DataFrame and writes that DataFrame to a CSV file.
from math import log2          # used by cross_entropy()
from pathlib import Path       # used by PickleToFile below
import pandas as pd

# PickleToFile and HexToBinary are my own helper classes
# (PickleToFile is shown further down).

def get_value_of_list(bit_list):
    p_number = 0
    for i in bit_list:
        if i == 1:
            p_number = p_number + 1
    return p_number

def cross_entropy(p, q):
    return -sum([p[i] * log2(q[i]) for i in range(len(p))])

if __name__ == "__main__":
    file_name = 'pickleData_AIMchat2.txt'
    pickle_file = 'AIMchat2.pickle'
    pk = PickleToFile(file_name, pickle_file)
    pk.create_pickle_file()
    h = HexToBinary(file_name)
    hex_list = h.read_file()
    num_of_bits = 8
    scale = 16
    bin_data = []
    for i in hex_list:
        bin_data.append(bin(int(i, scale))[2:].zfill(num_of_bits))
    my_bit_list = []
    for byte in bin_data:
        bit_list = []
        for bit in byte:
            bit_list.append(int(bit))
        num_of_one_divided_by_eight = get_value_of_list(bit_list) / 8
        my_bit_list.append(num_of_one_divided_by_eight)
    cross_entropy_list = []
    i = 0
    while i < len(my_bit_list):
        cross = cross_entropy([my_bit_list[i]], [my_bit_list[i + 1]])
        cross_entropy_list.append(cross)
        i = i + 2
    df = pd.DataFrame(cross_entropy_list)
    df.to_csv(r'AIMchat2.csv', index=False, index_label=False, chunksize=1000000, header=False)
I have changed create_pickle_file() to the code below to read files in the directory:
class PickleToFile:
    def __init__(self, name, pickle_file):
        self.name = name
        self.pickle_file = pickle_file

    def create_pickle_file(self):
        basepath = Path('pickle/')
        for item in basepath.iterdir():   # Path.iterdir() takes no arguments
            if item.is_file():
                checkThePickle = open(self.pickle_file, "rb")
                with open(self.name, 'w') as filehandler:
                    for listItem in checkThePickle:
                        filehandler.write('%s\n' % listItem)
But since, after reading each file, it has to write it to a text file and then to a CSV file, I don't know how to do that. I'd appreciate any suggestions.

If you are looking to get a list of files in a directory and process them, this should get you what you want:
How do I list all files of a directory?
Once you have this list of files, do a loop:
for each in list_of_files:
    process_function(each)
Then you are on your way: 'process_function' is your processing function, and the argument is the filename.
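Applied to the original script, a minimal sketch of that loop might look like the following; PickleToFile, HexToBinary and cross_entropy are assumed to be the poster's own helpers already in scope, and the output CSV name is simply derived from each pickle's name:

from pathlib import Path
import pandas as pd

def process_file(pickle_path):
    # Hypothetical per-file wrapper around the poster's existing steps:
    # pickle -> txt -> bit statistics -> cross-entropy list -> CSV.
    txt_name = pickle_path.stem + '.txt'
    pk = PickleToFile(txt_name, str(pickle_path))      # poster's own class
    pk.create_pickle_file()
    h = HexToBinary(txt_name)                          # poster's own class
    hex_list = h.read_file()
    bin_data = [bin(int(i, 16))[2:].zfill(8) for i in hex_list]
    my_bit_list = [sum(int(bit) for bit in byte) / 8 for byte in bin_data]
    cross_entropy_list = [cross_entropy([my_bit_list[i]], [my_bit_list[i + 1]])
                          for i in range(0, len(my_bit_list) - 1, 2)]
    out_csv = pickle_path.stem + '.csv'                # e.g. AIMchat2.csv
    pd.DataFrame(cross_entropy_list).to_csv(out_csv, index=False, header=False)

if __name__ == '__main__':
    for pickle_path in Path('pickle/').glob('*.pickle'):
        process_file(pickle_path)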

Related

f.read() doesn't read anything in the 2nd iteration of a loop

I am trying to create a script that reads files from a directory and creates a .txt file that maps the name of each file to its hash (SHA-256).
These are the imports and global variable declarations:
import os
from unicodedata import name
import hashlib
hashsha = hashlib.sha256()
directorio_config = r"C:\Users\santi\Documents\wsp_py\PA1\config"
directorio = r"C:\Users\santi\Documents\wsp_py\PA1\ficheros"
aux = dict()
BUF_SIZE = 65536 # lets read stuff in 64kb chunks!
I have two functions:
def get_ficheros(directorio, directorio_config):
    with os.scandir(directorio) as ficheros:
        for elemento in ficheros:
            if elemento.is_file():
                nombre_fichero = elemento.name
                ruta = directorio + "\\" + nombre_fichero
                hash_code = hash_file(ruta)
                aux[elemento.name] = hash_code
            else:
                newdir = directorio + "\\" + elemento.name
                get_ficheros(newdir, directorio_config)
    return aux
This one is supposed to read the files from a directory and put them in a dictionary, where the value is the hash calculated by the hash_file function.
Here it is:
def hash_file(filename):
    with open(filename, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            hashsha.update(data)
        f.close()
    return hashsha.hexdigest()
This code works properly for the first iteration. In the second iteration, f.read() reads b'' and that is stored in the variable "data" instead of the second file's contents, as expected.
I have no idea why this happens. Any help is appreciated. Thanks.
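One thing worth noting, independent of the empty read: hashsha is a single module-level object, so every call to hash_file() keeps feeding the same hash, and each hexdigest() covers all files read so far rather than just the current one. A minimal sketch that creates a fresh hash object per file (this reflects the intended per-file hashing and is not presented as a confirmed fix for the b'' symptom):

import hashlib

BUF_SIZE = 65536  # read in 64kb chunks, as in the original

def hash_file(filename):
    sha = hashlib.sha256()          # new hash object for each file
    with open(filename, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            sha.update(data)
    return sha.hexdigest()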

How to convert a folder of pickle files into a single CSV file

I have a directory containing about 1700 pickle files, where every file holds all of one user's Twitter posts. I want to convert it into a folder of CSV files, where every CSV file is named after the pickle file and each row contains one of the user's tweets...
After that, I want to keep just the top 20 CSVs with more samples than the others... how can I do that?
# khabarlist = open_file_linebyline(pkl_path)
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        # if len(file.name.split()) > 1:
        #     continue
        # if file.split('.')[-1] != "pkl":
        with open(file, 'r', encoding='utf8') as f:
            items = [i.strip() for i in f.read().split(",")]
            my_dict[file.replace(".pkl", "")] = items
            df = pd.DataFrame(my_dict)
            df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")
I wrote the sample code for it and it did not work...
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        if len(file.name.split()) > 1:
            continue
        if file.split('.')[-1] != "pkl":
            with open(file, 'r', encoding='utf-8', errors='replace') as f:
                print(f.readlines())
                items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
                my_dict[file.replace(".pkl", "")] = items
                df = pd.DataFrame(my_dict)
                df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")
and a better answer...
import os
import pandas as pd
import regex as re

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
for path in os.listdir(data_path):
    my_tweets = []
    df = pd.read_pickle(data_path + path)
    for tweet in df.tweet:
        url = re.findall(r"http\S+", tweet)
        if url == []:
            my_tweets.append(tweet)
    new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
    new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a")
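The second part of the question (keeping only the 20 CSVs with the most rows) is not covered above. One possible sketch, assuming each user's tweets have been written to a separate CSV in an out_dir folder (out_dir and keep_top_20 are hypothetical names, not part of the answer above):

import os
import pandas as pd

def keep_top_20(out_dir):
    # Rank the per-user CSV files by row count and return the 20 largest.
    counts = []
    for name in os.listdir(out_dir):
        if name.endswith(".csv"):
            n_rows = len(pd.read_csv(os.path.join(out_dir, name)))
            counts.append((n_rows, name))
    counts.sort(reverse=True)
    return [name for _, name in counts[:20]]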

Trying to use multiprocessing over files, but it freezes when I have the user input a path

I'm pretty new to Python, and I'm working on an assignment where multiprocessing is a requirement. I have a folder with a few .csv files in it and I want output of the form [file name, gradient, intercept] for each file in the folder. I can get this to work using a for loop, and the multiprocessing works when I put the file names manually into the code, like this:
import numpy
import os
import csv
from multiprocessing import Pool

files_txt = ['Data_set_1.csv', 'Data_set_2.csv', 'Data_set_3.csv', 'Data_set_4.csv']
print("files: ", files_txt)
lines = []

def GetLines(name):
    f = open(name)
    csv_f = csv.reader(f)
    csv_x = []
    csv_y = []
    for row in csv_f:
        try:
            csv_x.append(float(row[0]))
            csv_y.append(float(row[1]))
        except ValueError:
            pass
    f.close()
    x = numpy.array(csv_x)
    y = numpy.array(csv_y)
    m, b = numpy.polyfit(x, y, 1)
    lines.append([name, m, b])
    return lines

if __name__ == '__main__':
    pool = Pool(processes=2)
    result = pool.map(GetLines, files_txt)
    print(result)
This gives me my desired output; however, when I change the first line to:
path = input("path: ")
files = os.listdir(path)
files_txt = [a for a in files if a.endswith(".csv")]
It prints out the names of my files in the folder and then just does nothing.
I have no idea how to remedy this and I've been googling all night.
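A likely culprit (on Windows, where multiprocessing spawns fresh interpreters that re-import the module) is that the module-level input() and os.listdir() run again in every worker process. The usual advice is to keep the interactive code under the __main__ guard and to pass full paths to the workers. A minimal sketch of that layout, under those assumptions:

import os
import csv
import numpy
from multiprocessing import Pool

def get_line(path):
    # Fit y = m*x + b to the two numeric columns of one CSV file.
    csv_x, csv_y = [], []
    with open(path) as f:
        for row in csv.reader(f):
            try:
                csv_x.append(float(row[0]))
                csv_y.append(float(row[1]))
            except (ValueError, IndexError):
                pass
    m, b = numpy.polyfit(numpy.array(csv_x), numpy.array(csv_y), 1)
    return [os.path.basename(path), m, b]

if __name__ == '__main__':
    # Keep input() here so it only runs in the parent process.
    folder = input("path: ")
    files_txt = [os.path.join(folder, a) for a in os.listdir(folder)
                 if a.endswith(".csv")]
    with Pool(processes=2) as pool:
        result = pool.map(get_line, files_txt)
    print(result)

Returning one [name, m, b] list per file also avoids relying on the global lines list, which is not shared between worker processes.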

Script to read multiple input files and write an output file for each input file?

I have multiple input files, in the format below, which have to be processed.
Input file path: /tmp/input.
1.1.1.txt
1.1.2.txt
1.1.3.txt
But I want to have an output file for each input file in another folder, say /tmp/outputsmgr, like below:
1.1.1_output.csv
1.1.2_output.csv
1.1.3_output.csv
The issues are:
Firstly, I am not able to write the output files to another/different folder.
Secondly, after processing, the data from all input files gets merged and written to files in the input folder only, like below, instead of a separate output file for each input file.
All the files below contain the same data; instead, the data from 1.1.1.txt should be in 1.1.1_output.csv and the data from 1.1.2.txt should be in 1.1.2_output.csv.
1.1.1.txt_output.csv
1.1.2.txt_output.csv
1.1.3.txt_output.csv
How can I modify the below code to get the desired result?
import os
import csv
import re

def parseFile(fileName):
    # We are using a dictionary to store info for each file
    data = list()
    # data = dict()
    fh = open(fileName, "r")
    lines = fh.readlines()[1:]
    for line in lines:
        line = line.rstrip("\n")
        if re.search("sessmgr", line):
            splitted = line.split()
            temp = dict()
            temp["CPU"] = splitted[0]
            temp["facility"] = splitted[1]
            temp["instance"] = splitted[2]
            temp["cpu-used"] = splitted[3]
            temp["cpu-allc"] = splitted[4]
            temp["mem-used"] = splitted[5]
            temp["mem-allc"] = splitted[6]
            temp["files-used"] = splitted[7]
            temp["files-allc"] = splitted[8]
            temp["sessions-used"] = splitted[9]
            temp["sessions-allc"] = splitted[10]
            # print (splitted[2])
            data.append(temp)
            # continue;
    # print (data)
    return data

if __name__ == "__main__":
    inputsDirectory = "/tmp/input"
    outputDirectory = "/tmp/outputsmgr"
    path = os.path.abspath(inputsDirectory)
    pathout = os.path.abspath(outputDirectory)
    fileLists = ["{0}/{1}".format(path, x) for x in os.listdir(outputDirectory)]
    fileList = ["{0}/{1}".format(path, x) for x in os.listdir(inputsDirectory)]
    # print(fileList)
    csvRows = []
    for file in fileList:
        newRow = parseFile(file)
        csvRows.append(newRow)
    # print(csvRows)
    for files in fileList:
        outputFile = "output.csv"
        csvfile = open(os.path.join(files + "_" + outputFile), 'w')
        fieldnames = ["CPU",
                      "facility",
                      "instance",
                      "cpu-used",
                      "cpu-allc",
                      "mem-used",
                      "mem-allc",
                      "files-used",
                      "files-allc",
                      "sessions-used",
                      "sessions-allc"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        headers = {}
        for n in writer.fieldnames:
            headers[n] = n
        writer.writerow(headers)
        # writer.writeheader()
        for row in csvRows:
            for obj in row:
                print(obj)
                writer.writerow(obj)
I think the code below will do what you want. It processes the files in the input directory sequentially and the results returned from the parseFile() function get written to the corresponding output file in the output directory. It's important to get a new set of csvRows from each input file and write (just) those to each output file.
The code assumes the outputDirectory already exists, but if that's not the case, then you'll need to add code to create it before processing any of the files. Hint: use os.path.exists() and os.path.isdir() in conjunction with os.makedirs().
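For example, that directory check might look like this (a small sketch of the hint only, separate from the solution code below):

import os

output_dir = os.path.abspath("/tmp/outputsmgr")
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)   # create the output directory if it doesn't exist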
import csv
import os
import re

def parseFile(filePath, fieldnames, target_re=r"sessmgr"):
    """ Yield lines of file matching target regex. """
    with open(filePath, "r") as file:
        next(file)  # Skip/ignore first line.
        for line in file:
            if re.search(target_re, line):
                yield dict(zip(fieldnames, line.split()))

if __name__ == "__main__":
    OUTPUT_FILE_SUFFIX = "output.csv"
    inputsDirectory = "/tmp/input"
    outputDirectory = "/tmp/outputsmgr"
    fieldnames = ("CPU", "facility", "instance", "cpu-used", "cpu-allc", "mem-used",
                  "mem-allc", "files-used", "files-allc", "sessions-used",
                  "sessions-allc")
    input_dir = os.path.abspath(inputsDirectory)
    output_dir = os.path.abspath(outputDirectory)

    for in_filename in os.listdir(input_dir):
        in_filepath = os.path.join(input_dir, in_filename)
        print('in_filepath: "{}"'.format(in_filepath))
        in_rootname = os.path.splitext(in_filename)[0]
        out_filename = in_rootname + "_" + OUTPUT_FILE_SUFFIX
        out_filepath = os.path.join(output_dir, out_filename)
        print('out_filepath: "{}"'.format(out_filepath))
        with open(out_filepath, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(parseFile(in_filepath, fieldnames))

Only portions of data are being written to a CSV file, the rest is missing

I am parsing through many XML files and putting certain information into a CSV file. Because my XML files are named "1.xml", "2.xml", etc., I am using a for loop to cycle through the different XML file names. However, based on the range that I use in my for loop, my CSV file contains different data. For example, when my for loop range is 1:200, my CSV file includes info from XML files 1 to 199. However, when I change my range to 1:300, my CSV file only contains info for XML files 217 to 249. The info actually stored in my CSV file changes based on what I put in as the range for my for loop. Has anyone else had this error, and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path

# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)

data_head = []
data = []
missingfiles = 0
missfiles = []

MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)

data_head.append(MediaId)
# Family = member.find('Family').tag
data_head.append(Content)
data_head.append(ClassId)
data_head.append(Family)
# Species = member.find('Species').tag
data_head.append(Species)
# Genus = member.find('Genus').tag
data_head.append(Genus)
csvwriter.writerow(data_head)

for i in range(1, 190):
    #print (i)
    data = []
    inputfilename = str(i) + ".xml"
    my_file = Path(inputfilename)
    if my_file.is_file():
        data_labels = open('DataLabels.csv', 'w', newline='')
        tree = ET.parse(inputfilename)
        root = tree.getroot()
        MediaId = root[2].text
        Content = root[4].text
        ClassId = root[5].text
        Family = root[6].text
        Species = root[7].text
        Genus = root[8].text
        #print (vote)
        #count = 0
        #for Image in root.find('MediaId'):
        #    print (child.tag, child.attrib)
        #    name = child.find('MediaId').text
        #    print (Image.find ('MediaId').text)
        ##csvwriter.writerow (data_head)
        #data = []
        #if count == 0:
        #    print ("count is zero i'm in loop")
        #    MediaId = member.find('MediaId').tag
        #    count = count + 1
        #else:
        #    MediaId = root.findall('MediaId').text
        data.append(MediaId)
        data.append(Content)
        data.append(ClassId)
        #Family = member.find('Family').text
        data.append(Family)
        #Species = member.find('Species').text
        data.append(Species)
        #Genus = member.find('Genus').text
        data.append(Genus)
        csvwriter.writerow(data)
        data_labels.close()
        #print (data)
    else:
        missingfiles = missingfiles + 1
        missfiles = []
        missfiles.append(inputfilename)
        csvwriter2.writerow(missfiles)

print("missing", missingfiles, "files")
data_labels.close()
missing_files.close()
print("done")
Open the csv in append mode, else you are just overwriting the same file.
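For example, the open() inside the loop would become the following (a sketch of that single change; the refactor below avoids reopening the file inside the loop altogether, which is cleaner):

# open in append mode so each iteration adds rows instead of truncating the file
data_labels = open('DataLabels.csv', 'a', newline='')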
I think you need to divide your script into small readable functions.
First, you can create a function to parse an XML file:
import xml.etree.ElementTree as ET

def parse_xml_file(xml_path):
    """ Parse an XML file and return the data. """
    # type: (str) -> list
    tree = ET.parse(xml_path)
    root = tree.getroot()
    return [
        root[2].text,
        root[4].text,
        root[5].text,
        root[6].text,
        root[7].text,
        root[8].text]
This function parses an XML file and returns one record containing a list of values.
Then, you can create a function to iterate over a list of XML files (existing files) and populate the CSV file:
import csv
import io
import os

def populate_data_labels(xml_path_list, work_dir="."):
    header = ["Media Id", "Family", "Species", "Genus", "Content", "ClassId"]
    with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in xml_path_list:
            writer.writerow(parse_xml_file(xml_path))
This function uses parse_xml_file() to extract each record.
You can create a function to log the missing files. You can use CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
    header = ["Filename"]
    with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in missing_files:
            writer.writerow([os.path.basename(xml_path)])
Finally, you can write a function which searches for the XML files and calls the previous functions:
def parse_work_dir(work_dir="."):
    all_files = [os.path.join(work_dir, "{0}.xml".format(idx))
                 for idx in range(1, 190)]
    existing_files = (path for path in all_files if os.path.exists(path))
    populate_data_labels(existing_files, work_dir)
    missing_files = (path for path in all_files if not os.path.exists(path))
    populate_missing_files(missing_files, work_dir)
Usage:
parse_work_dir("/path/to/your/working/dir")
