How to convert a folder of pickle files into a single CSV file - Python

I have a directory containing about 1700 pickle files; each file holds all the Twitter posts of one user. I want to convert it into a folder of CSV files, where each CSV file is named after its pickle file and each row contains one tweet of the user...
After that, I want to keep just the top 20 CSVs with more samples than the others... how can I do that?
# khabarlist = open_file_linebyline(pkl_path)
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        # if len(file.name.split()) > 1:
        #     continue
        # if file.split('.')[-1] != "pkl":
        with open(file, 'r', encoding='utf8') as f:
            items = [i.strip() for i in f.read().split(",")]
            my_dict[file.replace(".pkl", "")] = items
            df = pd.DataFrame(my_dict)
            df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")
I wrote the sample code above for it and it did not work...

def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        if len(file.name.split()) > 1:
            continue
        if file.split('.')[-1] != "pkl":
            with open(file, 'r', encoding='utf-8', errors='replace') as f:
                print(f.readlines())
                items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
                my_dict[file.replace(".pkl", "")] = items
                df = pd.DataFrame(my_dict)
                df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")

and a better answer, which loads each .pkl with pandas.read_pickle() instead of opening it as text...
import os
import pandas as pd
import regex as re

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"

for path in os.listdir(data_path):
    my_tweets = []
    df = pd.read_pickle(data_path + path)
    for tweet in df.tweet:
        url = re.findall(r"http\S+", tweet)
        if url == []:
            my_tweets.append(tweet)
    new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
    new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a")
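One caveat with the loop above: because final.csv is opened with mode="a", to_csv appends the header row again for every user. It also leaves the two remaining parts of the question open, writing one CSV per user and keeping only the 20 users with the most tweets. A minimal sketch of those two steps, assuming the same pkl layout and tweet column as above (the csv_dir path is made up):

import os
import pandas as pd

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
csv_dir = "/content/drive/My Drive/twint/Data/csv/"   # hypothetical output folder
os.makedirs(csv_dir, exist_ok=True)

sizes = {}
for path in os.listdir(data_path):
    if not path.endswith(".pkl"):
        continue
    user = path.replace(".pkl", "")
    df = pd.read_pickle(os.path.join(data_path, path))
    # one CSV per user, named after the pickle file, one tweet per row
    df[["tweet"]].to_csv(os.path.join(csv_dir, user + ".csv"), index=False)
    sizes[user] = len(df)

# the 20 users with the most tweets
top20 = sorted(sizes, key=sizes.get, reverse=True)[:20]
print(top20)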

Related

Python - Read files from folder and Write CSV file in format

import glob
import os
import csv
from collections import OrderedDict

# Remove output file if already exists. Resolve the append issue
file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
if os.path.isfile(file_path):
    os.remove(file_path)

list_of_files = glob.glob('C:\\Users\\Desktop\\Cobol\\*.CBL')  # Input files in Folder
Fields = ['Program Name', 'LinesofCode']  # to be displayed in output CSV file

# opening output csv file to write (Fields)
file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
with open(file_path, 'a') as csvfile1:
    csvwriter = csv.writer(csvfile1)
    csvwriter.writerow(Fields)
    csvfile1.close()

def process_files_loc(list_of_files):
    for fileName in list_of_files:
        with open(fileName) as i:
            count = sum(1 for line in i)
            my_dict = {i: count}  # input filename and its lines of code
        ordered_dict = OrderedDict()  # using OrderedDict
        print(ordered_dict)
        # creating ordered dict from dict
        ordered_dict = OrderedDict(my_dict)
        print(ordered_dict)
        # writing records of Program name and LinesofCode to output csv file
        file_path = 'C:\\Users\\Desktop\\Cobol\\Outputs\\LOC3X.csv'
        with open(file_path, 'a') as csvfile2:
            csvwriter = csv.writer(csvfile2)
            csvwriter.writerows(ordered_dict)
            csvfile2.close()

process_files_loc(list_of_files)
Output in Terminal (Error):
PS C:\Users\Python-1> & C:/Users/AppData/Local/Programs/Python/Python310/python.exe c:/Users/Python-1/one.py
OrderedDict()
OrderedDict([(<_io.TextIOWrapper name='C:\\Users\\Desktop\\Cobol\\ABCDEFGH.CBL' mode='r' encoding='cp1252'>, 191)])
OrderedDict()
OrderedDict([(<_io.TextIOWrapper name='C:\\Users\\Desktop\\Cobol\\IJKLMNOP.CBL' mode='r' encoding='cp1252'>, 195)])
Actual output file in the folder:
C:\Users\Desktop\Cobol\Outputs
Name        Date Modified    Type             Size
LOC3X.csv   9/15/2022 time   Comma Separated  1KB
Problem: the script executed, read the 2 CBL files in the folder, and created 1 CSV file in the output folder. The output CSV file should contain:
Program Name    LinesofCode
ABCDEFGH.CBL    191
IJKLMNOP.CBL    195
However, the CSV file actually contains only the header line:
Program Name    LinesofCode
Try something like this. In the original, the dictionary key is the file handle rather than the file name (hence the _io.TextIOWrapper entries in the printed OrderedDicts), and csvwriter.writerows() is handed the dict itself rather than rows of [name, count], so nothing usable reaches the CSV:

import glob
import csv
import os

def process_files_loc(files):
    res = []
    for file in files:
        with open(file) as f:
            line_count = len([line.strip("\n") for line in f if line != "\n"])
            res.append([os.path.basename(f.name), line_count])
    return res

if __name__ == '__main__':
    with open('C:\\Users\\Main\\Desktop\\test\\test.csv', 'w', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(['Program Name', 'LinesofCode'])
        csvwriter.writerows(process_files_loc(glob.glob('C:\\Users\\Main\\Desktop\\test\\*.PY')))
Result: (screenshot of the resulting test.csv)
Regards,

Converting a list of txt files into a list of csv files with python

I have the following code to convert a single .txt file into a single .csv file, but I need the code to iterate over a directory of .txt files and produce a directory of the same files in .csv format.
import csv

textfile = 'X:/general/DavidOrgEcon/GSTT/text to csv/Group.txt'
outfile = 'X:/general/DavidOrgEcon/GSTT/text to csv/Group.csv'

with open(textfile, 'r') as csvfile:
    In_text = csv.reader(csvfile, delimiter=':')
    all_rows = []
    row_dict = {}
    count_row = 1
    for row in In_text:
        if len(row) > 0:
            row_dict[row[0].strip()] = row[1].strip()
            if count_row % 4 == 0:
                all_rows.append(row_dict)
                row_dict = {}
            count_row += 1
    print(all_rows)
    keys = all_rows[0].keys()
    print(keys)
    with open(outfile, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(all_rows)
So assuming you have your existing function
def text_to_csv(infilepath, outfilepath):
    ...
which can read a text file from infilepath and output the csv to outfilepath, then you can make a new function that takes two directories and calls it on every text file in the first:
import os

def convert_directory(in_dir, out_dir):
    # Loop through every file in the directory
    for filename in os.listdir(in_dir):
        # Split the file name into a base portion and an extension
        # e.g. "file.txt" -> ("file", ".txt")
        base_name, extension = os.path.splitext(filename)
        # If it is a text file, do the transformation
        if extension == ".txt":
            # Construct the name of the csv file to create
            csv_filename = f"{base_name}.csv"
            # Call your function with the full filepaths
            text_to_csv(
                os.path.join(in_dir, filename),
                os.path.join(out_dir, csv_filename)
            )

convert_directory(
    "X:/general/DavidOrgEcon/GSTT/text to csv/input_dir",
    "X:/general/DavidOrgEcon/GSTT/text to csv/output_dir",
)

Script to read multiple input files and write a separate output file for each input file?

I have multiple input files, in the format below, which have to be processed.
Input file path: /tmp/input.
1.1.1.txt
1.1.2.txt
1.1.3.txt
But, I want to have output files for each input file in another folder suppose (/tmp/outputsmgr) like below:
1.1.1_output.csv
1.1.2_output.csv
1.1.3_output.csv
The issues are:
Firstly, I am not able to write the output files to another/different folder.
Secondly, after processing, the data from all the input files gets merged into every output file, and those files end up in the input folder, like below, instead of a separate output file for each input file.
All the files below contain the same data; instead, the 1.1.1.txt data should be in 1.1.1_output.csv and the 1.1.2.txt data should be in 1.1.2_output.csv.
1.1.1.txt_output.csv
1.1.2.txt_output.csv
1.1.3.txt_output.csv
How can I modify the below code to get the desired result?
import os
import csv
import re

def parseFile(fileName):
    # We are using a dictionary to store info for each file
    data = list()
    # data = dict()
    fh = open(fileName, "r")
    lines = fh.readlines()[1:]
    for line in lines:
        line = line.rstrip("\n")
        if re.search("sessmgr", line):
            splitted = line.split()
            temp = dict()
            temp["CPU"] = splitted[0]
            temp["facility"] = splitted[1]
            temp["instance"] = splitted[2]
            temp["cpu-used"] = splitted[3]
            temp["cpu-allc"] = splitted[4]
            temp["mem-used"] = splitted[5]
            temp["mem-allc"] = splitted[6]
            temp["files-used"] = splitted[7]
            temp["files-allc"] = splitted[8]
            temp["sessions-used"] = splitted[9]
            temp["sessions-allc"] = splitted[10]
            # print (splitted[2])
            data.append(temp)
            # continue;
    # print (data)
    return data

if __name__ == "__main__":
    inputsDirectory = "/tmp/input"
    outputDirectory = "/tmp/outputsmgr"
    path = os.path.abspath(inputsDirectory)
    pathout = os.path.abspath(outputDirectory)
    fileLists = ["{0}/{1}".format(path, x) for x in os.listdir(outputDirectory)]
    fileList = ["{0}/{1}".format(path, x) for x in os.listdir(inputsDirectory)]
    # print(fileList)
    csvRows = []
    for file in fileList:
        newRow = parseFile(file)
        csvRows.append(newRow)
    # print(csvRows)
    for files in fileList:
        outputFile = "output.csv"
        csvfile = open(os.path.join(files + "_" + outputFile), 'w')
        fieldnames = ["CPU",
                      "facility",
                      "instance",
                      "cpu-used",
                      "cpu-allc",
                      "mem-used",
                      "mem-allc",
                      "files-used",
                      "files-allc",
                      "sessions-used",
                      "sessions-allc"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        headers = {}
        for n in writer.fieldnames:
            headers[n] = n
        writer.writerow(headers)
        # writer.writeheader()
        for row in csvRows:
            for obj in row:
                print(obj)
                writer.writerow(obj)
I think the code below will do what you want. It processes the files in the input directory sequentially and the results returned from the parseFile() function get written to the corresponding output file in the output directory. It's important to get a new set of csvRows from each input file and write (just) those to each output file.
The code assumes the outputDirectory already exists, but if that's not the case, then you'll need to add code to create it before processing any of the files. Hint: use os.path.exists() and os.path.isdir() in conjunction with os.makedirs().
import csv
import os
import re

def parseFile(filePath, fieldnames, target_re=r"sessmgr"):
    """ Yield lines of file matching target regex. """
    with open(filePath, "r") as file:
        next(file)  # Skip/ignore first line.
        for line in file:
            if re.search(target_re, line):
                yield dict(zip(fieldnames, line.split()))

if __name__ == "__main__":
    OUTPUT_FILE_SUFFIX = "output.csv"
    inputsDirectory = "/tmp/input"
    outputDirectory = "/tmp/outputsmgr"
    fieldnames = ("CPU", "facility", "instance", "cpu-used", "cpu-allc", "mem-used",
                  "mem-allc", "files-used", "files-allc", "sessions-used",
                  "sessions-allc")
    input_dir = os.path.abspath(inputsDirectory)
    output_dir = os.path.abspath(outputDirectory)

    for in_filename in os.listdir(input_dir):
        in_filepath = os.path.join(input_dir, in_filename)
        print('in_filepath: "{}"'.format(in_filepath))
        in_rootname = os.path.splitext(in_filename)[0]
        out_filename = in_rootname + "_" + OUTPUT_FILE_SUFFIX
        out_filepath = os.path.join(output_dir, out_filename)
        print('out_filepath: "{}"'.format(out_filepath))
        with open(out_filepath, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(parseFile(in_filepath, fieldnames))
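As noted, the code assumes /tmp/outputsmgr already exists. A minimal sketch of the directory check from the hint above, to run once before the for loop:

import os

output_dir = os.path.abspath("/tmp/outputsmgr")
# create the output directory if it is missing
if not (os.path.exists(output_dir) and os.path.isdir(output_dir)):
    os.makedirs(output_dir)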

Python, how to get number of unique ids per file and store filename and unique ids for each file in csv file?

I am working on a project to count the unique commenters in a chat and, for each file, store the file name and number of commenters of that chat in a CSV. However, the code I have now opens all of the documents and counts the commenters across the multiple files, so instead of getting the individual unique commenters per file, it counts all the commenters across all files. There are 10 unique commenters across all the files; however, I need to see the number of unique commenters for each file and store that data in the CSV file (see the Desired Output for csv file picture). I feel like I am very close, but I am stuck. Can anyone help with this issue or suggest other methods of doing this?
import os, sys, json
from collections import Counter
import csv

filename = ""
filepath = ""
jsondata = ""
dictjson = ""
commenterid = []
FName = []
UList = []
TextFiles = []
UCommenter = 0

def get_FilePathList():
    for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
        for file in files:
            ##Find File with specific ending
            if file.endswith("chatinfo.txt"):
                path = "/Users/ammcc/Desktop/"
                ##Get the File Path of the file
                filepath = os.path.join(path, file)
                ##Get the Filename of the file ending in chatinfo.txt
                head, filename = os.path.split(filepath)
                ##Append all Filepaths of files ending in chatinfo.txt to TextFiles array/list
                TextFiles.append(filepath)
                ##Append all Filenames of files ending in chatinfo.txt to FName array/list
                FName.append(filename)

def open_FilePath():
    for x in TextFiles:
        ##Open each filepath in TextFiles one by one
        open_file = open(x)
        ##Read that file line by line
        for line in open_file:
            ##Parse the Json of the file into jsondata
            jsondata = json.loads(line)
            ##Line not needed but, Parse the Json of the file into dictjson as Dictionary
            dictjson = json.dumps(jsondata)
            ## if the field commenter is found in jsondata
            if "commenter" in jsondata:
                ##Then, append the field ["commenter"]["_id"] (nested value in the json) into list commenterid
                commenterid.append(jsondata["commenter"]["_id"])
                ##Get and count the unique ids for the commenter
                Ucommenter = (len(set(commenterid)))
                ##Append that unique count to UList
                UList.append(Ucommenter)
        ## create or append to the Commenter.csv file
        with open('Commenter.csv', 'a') as csvfile:
            filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            ##Write the individual filename and the unique commenters for that file
            filewriter.writerow([filename, Ucommenter])
        commenterid.clear()

##Issue: Not counting the commenters for each file and storing the filename and its specific number of commenters in csv.
##The csv is being created but the rows in the csv are not generating correctly.

##Call the functions
get_FilePathList()
open_FilePath()
Current Output in csv file
Desired Output for csv file
Output after suggestion
Output and code after Nemanja Radojković's solution:
Correct output format, but still not counting the unique commenters per file.
import json, os
import pandas as pd
import numpy as np
from collections import Counter

TextFiles = []
FName = []
csv_rows = []
commenterid = []
unique_id = []
NC = []

for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
    for file in files:
        if file.endswith("chatinfo.txt"):
            path = "/Users/ammcc/Desktop/"
            filepath = os.path.join(path, file)
            head, filename = os.path.split(filepath)
            TextFiles.append(filepath)
            FName.append(filename)
            n_commenters = 0
            with open(filepath) as open_file:
                for line in open_file:
                    jsondata = json.loads(line)
                    if "commenter" in jsondata:
                        commenterid.append(jsondata["commenter"]["_id"])
                        list_set = set(commenterid)
                        unique_list = (list(list_set))
                        for x in list_set:
                            n_commenters += 1
            commenterid.clear()
            csv_rows.append([filename, n_commenters])

df = pd.DataFrame(csv_rows, columns=['FileName', 'Unique_Commenters'])
df.to_csv('CommeterID.csv', index=False)
Try this:
import json
import os

import pandas as pd

TextFiles = []
FName = []
csv_rows = []

for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
    for file in files:
        ##Find File with specific ending
        if file.endswith("chatinfo.txt"):
            path = "/Users/ammcc/Desktop/"
            ##Get the File Path of the file
            filepath = os.path.join(path, file)
            ##Get the Filename of the file ending in chatinfo.txt
            head, filename = os.path.split(filepath)
            ##Append all Filepaths of files ending in chatinfo.txt to TextFiles array/list
            TextFiles.append(filepath)
            ##Append all Filenames of files ending in chatinfo.txt to FName array/list
            FName.append(filename)
            n_commenters = 0
            with open(filepath) as open_file:
                ##Read that file line by line
                for line in open_file:
                    ##Parse the Json of the file into jsondata
                    jsondata = json.loads(line)
                    ## if the field commenter is found in jsondata
                    if "commenter" in jsondata:
                        n_commenters += 1
            csv_rows.append([filename, n_commenters])

df = pd.DataFrame(csv_rows, columns=['filename', 'n_commenters'])
df.to_csv('some_filename.csv', index=False)
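Note that this counts every line that has a commenter field, not the unique commenters. If the goal is the number of unique commenter ids per file, a per-file set is enough; a sketch along the same lines (the output filename is made up):

import json
import os

import pandas as pd

csv_rows = []
for root, dirs, files in os.walk("/Users/ammcc/Desktop/"):
    for file in files:
        if file.endswith("chatinfo.txt"):
            filepath = os.path.join(root, file)
            unique_ids = set()          # reset for every file
            with open(filepath) as open_file:
                for line in open_file:
                    jsondata = json.loads(line)
                    if "commenter" in jsondata:
                        unique_ids.add(jsondata["commenter"]["_id"])
            csv_rows.append([file, len(unique_ids)])

df = pd.DataFrame(csv_rows, columns=['filename', 'n_unique_commenters'])
df.to_csv('unique_commenters.csv', index=False)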

Only portions of data are being written to a csv file, the rest is missing

I am parsing through many xml files and putting certain information into a csv file. Because my xml files are named "1.xml", "2.xml", etc., I am using a for loop to cycle through the different xml file names. However, based on the range that I use in the for loop, my csv file contains different data. For example, when my for loop range is 1:200, my csv file includes info from xml files 1 to 199. However, when I change the range to 1:300, my csv file only contains info for xml files 217 to 249. The info actually stored in my csv file changes based on what I put in as the range for my for loop. Has anyone else had this error, and do you have any solutions?
My code is below:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path

# open a file for writing
data_labels = open('DataLabels.csv', 'w', newline='')
missing_files = open('MissingFiles.csv', 'w', newline='')

# create the csv writer object
csvwriter = csv.writer(data_labels)
csvwriter2 = csv.writer(missing_files)

data_head = []
data = []
missingfiles = 0
missfiles = []

MediaId = "Media Id"
#data_head.append (MediaId)
Family = "Family"
#data_head.append (Family)
Species = "Species"
#data_head.append (Species)
Genus = "Genus"
Content = "Content"
ClassId = "ClassId"
#data_head.append (Genus)

data_head.append(MediaId)
# Family = member.find('Family').tag
data_head.append(Content)
data_head.append(ClassId)
data_head.append(Family)
# Species = member.find('Species').tag
data_head.append(Species)
# Genus = member.find('Genus').tag
data_head.append(Genus)
csvwriter.writerow(data_head)

for i in range(1, 190):
    #print (i)
    data = []
    inputfilename = str(i) + ".xml"
    my_file = Path(inputfilename)
    if my_file.is_file():
        data_labels = open('DataLabels.csv', 'w', newline='')
        tree = ET.parse(inputfilename)
        root = tree.getroot()
        MediaId = root[2].text
        Content = root[4].text
        ClassId = root[5].text
        Family = root[6].text
        Species = root[7].text
        Genus = root[8].text
        #print (vote)
        #count = 0
        #for Image in root.find('MediaId'):
        #    print (child.tag, child.attrib)
        #    name = child.find('MediaId').text
        #    print (Image.find ('MediaId').text)
        ##csvwriter.writerow (data_head)
        #data = []
        #if count == 0:
        #    print ("count is zero i'm in loop")
        #    MediaId = member.find('MediaId').tag
        #    count = count + 1
        #else:
        #    MediaId = root.findall('MediaId').text
        data.append(MediaId)
        data.append(Content)
        data.append(ClassId)
        #Family = member.find('Family').text
        data.append(Family)
        #Species = member.find('Species').text
        data.append(Species)
        #Genus = member.find('Genus').text
        data.append(Genus)
        csvwriter.writerow(data)
        data_labels.close()
        #print (data)
    else:
        missingfiles = missingfiles + 1
        missfiles = []
        missfiles.append(inputfilename)
        csvwriter2.writerow(missfiles)

print("missing", missingfiles, "files")
data_labels.close()
missing_files.close()
print("done")
Open the csv in append mode, else you are just overwriting the same file.
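More specifically, the second open('DataLabels.csv', 'w', ...) inside the loop keeps truncating the file that csvwriter is still writing to. Switching that inner open to append mode works, but simply not re-opening the file inside the loop avoids the problem entirely. A sketch of the latter, reusing the imports, data_head and csvwriter2 from the question above:

# open DataLabels.csv once, before the loop, and write through the same handle
data_labels = open('DataLabels.csv', 'w', newline='')
csvwriter = csv.writer(data_labels)
csvwriter.writerow(data_head)

for i in range(1, 190):
    inputfilename = str(i) + ".xml"
    if Path(inputfilename).is_file():
        root = ET.parse(inputfilename).getroot()
        csvwriter.writerow([root[2].text, root[4].text, root[5].text,
                            root[6].text, root[7].text, root[8].text])
    else:
        csvwriter2.writerow([inputfilename])

data_labels.close()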
I think you need to divide your script into small, readable functions.
First, you can create a function to parse an XML file:
import xml.etree.ElementTree as ET

def parse_xml_file(xml_path):
    """ Parse an XML file and return the data. """
    # type: (str) -> list
    tree = ET.parse(xml_path)
    root = tree.getroot()
    return [
        root[2].text,
        root[4].text,
        root[5].text,
        root[6].text,
        root[7].text,
        root[8].text]
This function parses an XML file and returns one record containing a list of values.
Then, you can create a function to iterate over a list of XML files (existing files) and populate the CSV file:
import csv
import io
import os

def populate_data_labels(xml_path_list, work_dir="."):
    header = ["Media Id", "Family", "Species", "Genus", "Content", "ClassId"]
    with io.open(os.path.join(work_dir, 'DataLabels.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in xml_path_list:
            writer.writerow(parse_xml_file(xml_path))
This function uses parse_xml_file() to extract each record.
You can create a function to log the missing files. You can use CSV format (or a simple text file):
def populate_missing_files(missing_files, work_dir="."):
    header = ["Filename"]
    with io.open(os.path.join(work_dir, 'MissingFiles.csv'), 'w') as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        for xml_path in missing_files:
            writer.writerow([os.path.basename(xml_path)])
Finally, you can write a function which searches for the XML files and calls the previous functions:
def parse_work_dir(work_dir="."):
    all_files = [os.path.join(work_dir, "{0}.xml".format(idx))
                 for idx in range(1, 190)]

    existing_files = (path for path in all_files if os.path.exists(path))
    populate_data_labels(existing_files, work_dir)

    missing_files = (path for path in all_files if not os.path.exists(path))
    populate_missing_files(missing_files, work_dir)
Usage:
parse_work_dir("/path/to/your/working/dir")
