Problem reading the mu (μ) character in Python

I have an input file that contains a row with multiple mu (μ) characters. The Python code simply opens the file, does some manipulation, and saves the result in .csv format. When I save that file as .csv it produces weird characters (�). The attached images show the input and output files when opened in Excel.
Input CSV file:
Output CSV file:
from pathlib import Path
import pandas as pd
import time
import argparse

parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('path',
                    help='define the directory to folder/file')

start = time.time()

def main(path_files):
    rs_columns = "SourceFile,RowNum,SampleID,Method,Element,Result".split(",")
    rs = pd.DataFrame(columns=rs_columns)
    if path_files.is_file():
        fnames = [path_files]
    else:
        fnames = list(Path(path_files).glob("*.csv"))
    for fn in fnames:
        if "csv" in str(fn):
            # df = pd.read_csv(str(fn))
            df = pd.read_csv(str(fn), header=None, sep='\n')
            df = df[0].str.split(',', expand=True)
        else:
            print("Unknown file", str(fn))
        non_null_columns = [col for col in df.columns if df.loc[:, col].notna().any()]
        # loop thru each column for the whole file and create a row of results in the output file
        for i in range(1, len(non_null_columns)):
            SourceFile = Path(fn.name)
            Method = "WetScreening"
            Element = df.iloc[1, i]
            print(Element)
            for j in range(2, len(df)):
                RowNum = j + 1
                Result = df.iloc[j, i]
                SampleID = df.iloc[j, 0]
                rs = rs.append(pd.DataFrame({
                    "SourceFile": [SourceFile],
                    "RowNum": [RowNum],
                    "SampleID": [SampleID],
                    "Method": [Method],
                    "Element": [Element],
                    "Result": [Result]
                }), ignore_index=True)
    rs.to_csv("check.csv", index=False)
    print("Output: check.csv")

if __name__ == "__main__":
    start = time.time()
    args = parser.parse_args()
    path = Path(args.path)
    main(path)
    print("Processed time: ", time.time() - start)
Any help?

Try encoding to utf-8:
rs.to_csv("check.csv",index=False, encoding='UTF-8')
See also Pandas df.to_csv("file.csv" encode="utf-8") still gives trash characters for minus sign
That answer mentions the BOM bytes (0xEF, 0xBB, 0xBF) at the start of the file, which act as a UTF-8 signature:
rs.to_csv('check.csv', index=False, encoding='utf-8-sig')
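As a quick check, here is a minimal sketch (not from the original answer; the column names and values are made up for illustration) that writes the same frame with and without the BOM and prints the signature bytes Excel looks for:

import pandas as pd

df = pd.DataFrame({"Element": ["Cu", "Fe"], "Result": ["12 μg/L", "7 μg/L"]})

df.to_csv("plain_utf8.csv", index=False, encoding="utf-8")      # no BOM: Excel may guess the wrong encoding
df.to_csv("with_bom.csv", index=False, encoding="utf-8-sig")    # BOM prepended as a UTF-8 signature

with open("with_bom.csv", "rb") as f:
    print(f.read(3))  # b'\xef\xbb\xbf'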

Related

The csv writer is writing some unrealistic values to the csv in Python

In my code, the csv writer is writing some unrealistic values to the CSV file.
My goal is to read all CSV files in one directory, filter on a specific column, and write the filtered dataframes to a consolidated CSV file.
I am able to get the outputs as required in the VS console, but I am not able to write them into a CSV file.
Kindly help me understand what I am doing incorrectly.
This is my sample input:
And this is the output I am getting:
Code:
import pandas as pd
import os
import glob
import csv
from pandas.errors import EmptyDataError

# use glob to get all the csv files in the folder
path = os.getcwd()
# print(path)
csv_files = glob.glob(os.path.join(path, "*.csv"))
print(csv_files)

col_name = input("Enter the column name to filter: ")
print(col_name)
State_Input = input("Enter the {} ".format(col_name))
print(State_Input)

df_empty = pd.DataFrame()

for i in csv_files:
    try:
        df = pd.read_csv(i)
        # print(df.head(5))
        State_Filter = df["State"] == State_Input
        print(df[State_Filter])
        df_child = (df[State_Filter])
        with open('D:\\PythonProjects\\File-Split-Script\\temp\\output\\csv_fil111.csv', 'w') as csvfile:
            data_writer = csv.writer(csvfile, dialect='excel')
            for row in df_child:
                data_writer.writerows(row)
    except EmptyDataError as e:
        print('There was an error in your input, please try again :{0}'.format(e))
Use DataFrame.to_csv to write your file in one go. Prefer storing your filtered dataframes in a list, then concatenate all of them into a new dataframe:
import pandas as pd
import pathlib

data_dir = pathlib.Path.cwd()

# Your input here
state = input('Enter the state: ')  # Gujarat, Bihar, ...
print(state)

data = []
for csvfile in data_dir.glob('*.csv'):
    df = pd.read_csv(csvfile)
    df = df.loc[df['State'] == state]
    data.append(df)

df = pd.concat(data, ignore_index=True)
df.to_csv('output.csv', index=False)

Efficient way to combine multiple csv

I have over 100K CSVs (total file size north of 150 GB) which I need to join. All have standard column names, although the sequence of columns may not match and some CSVs have a few columns missing.
So far I just created a dataframe and kept concatenating the dataframe from each CSV in each iteration, to build one standard dataframe containing all columns, which I eventually intended to save as a CSV.
I tried building a dataframe from 1000 sample CSVs and noticed that as the dataframe grew, the iteration rate dropped from 10 to 1.5 per second, which suggests a similar trend if I went all-in with 100K CSVs, taking days if not months to combine them.
Is there a better way of combining a huge number of CSV files?
Here is my code
df_t1 = pd.DataFrame()
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
    df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
    if thisCSV.endswith('type1.csv'):
        df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)

df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Possible improvement
Method 1: Using Pandas
# df_t1 = pd.DataFrame()
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    if thisCSV.endswith('type1.csv'):
        df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
        # df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])  # unnecessary to loop, use next line instead
        df["File Name"] = thisCSV  # places thisCSV in every row
        # df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)  # concat in a loop is slow, append to a list instead
        df_t1_lst.append(df)

df_t1 = pd.concat(df_t1_lst, ignore_index=True)  # form dataframe from list (faster than pd.concat in a loop)
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Method 1a
Using Pandas to continuously append to CSV output file
import os
import pandas as pd
from tqdm import tqdm

def str_to_bytes(s):
    ' String to byte array '
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

SEPARATOR = ','      # Separator used by CSV file
write_header = True

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]

output_file = outpath + "df_t1.csv"
with open(output_file, "w") as ofile:
    pass  # create empty output file

for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    input_file = pathxl + "\\" + thisCSV
    if thisCSV.endswith('type1.csv') and good_file(input_file):
        df = pd.read_csv(input_file)
        if df.shape[0] > 0:
            df['File Name'] = thisCSV       # add filename column
            df = df.sort_index(axis=1)      # sort columns in ascending order
            # Append to output file
            df.to_csv(output_file, mode='a',
                      index=False,
                      header=write_header)
            write_header = False            # only write header once
        del df
Method 2: Binary Files
Reading/writing binary data and using a memory map should be faster.
from tqdm import tqdm
import os
import mmap

def str_to_bytes(s):
    ' String to byte array '
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

SEPARATOR = ','   # Separator used by CSV file
header = None

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

with open(outpath + "df_t1.csv", "wb") as ofile:
    for i in tqdm(range(len(excelNames))):
        thisCSV = str(excelNames[i]).lower().strip()
        input_file = pathxl + "\\" + thisCSV
        if thisCSV.endswith('type1.csv') and good_file(input_file):
            with open(input_file, "rb") as ifile:
                print('file ', thisCSV)
                # memory-map the file, size 0 means whole file
                with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                    text_iter = iter(mmap_obj.read().split(b'\n'))
                    if header is None:
                        header = next(text_iter)
                        header = header.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
                        ofile.write(header)   # write header
                    else:
                        next(text_iter)       # ignore header row
                    # write data to output file
                    file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
                    for line in text_iter:
                        if line.strip():      # skip blank lines
                            ofile.write(line.rstrip() + file_value)

Run a python script on all files in a directory

First time posting a question here; hopefully someone who has tried this can share their insights... I've been working on this for the last few days and nights, and now I'm getting nowhere trying to loop this script over every file in a directory.
Basically, these two scripts work perfectly fine: the script takes a PDF file and converts it to an Excel workbook. Now what I need to do is go through all files in a selected directory and do the same job.
I keep getting stuck at the file-opening stage. Does this mean the data (the PDF page, data[0]) can't be read in, or should I add more stages to bring the dataset in?
Do I have to create a list for the dataset so I can call in the data, since there would be more than one dataset to call in? Is this why Python can't read data[0]?
Revised Script
# import
import os
import glob
import pdftotext
import openpyxl
from pathlib import Path
from string import ascii_uppercase

# open a pdf file
def to_excel(pdf_file):
    with open(pdf_file, 'rb') as f:
        data = pdftotext.PDF(f)

    # operate data to get titles, values
    datas = data[0].split('\r\n')
    finalData = list()
    for item in datas:
        if item != '':
            finalData.append(item)
    finalDataRefined = list()
    for item in finalData:
        if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
            finalDataRefined.append(item.strip())

    titles = list()
    values = list()
    for num, item in enumerate(finalDataRefined):
        if num % 2 == 0:
            titles.append(item)
        else:
            values.append(item)

    # get an output file name
    OPRAST = values[1]
    filename = work_dir / f"{OPRAST}.xlxs"

    # create an excel workbook
    excel_file = openpyxl.Workbook()
    excel_sheet = excel_file.active
    excel_sheet.append([])
    alphaList = list(ascii_uppercase)
    for alphabet in alphaList:
        excel_sheet.column_dimensions[alphabet].width = 20
    excel_sheet.append(titles)
    excel_sheet.append(values)

    # save the excel workbook
    excel_file.save(filename)
    excel_file.close

# run the script on every file in a directory
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
    to_excel(pdf_file)
I basically know what you want to do, but your code's indentation is hard to read, especially since it's Python.
Is your goal to create an Excel file for each PDF in your chosen directory, or to aggregate all the PDF files into a single Excel file?
The following code is for the first goal.
Code logic:
- get all the PDF files
- loop over all the PDF files; for each one:
  - open the PDF file
  - do some operations
  - export to an Excel file
Your full code may look like this (just a guess):
# ----------------import part-------------------
import os
import glob
import pdftotext
import openpyxl
from string import ascii_uppercase
from pathlib import Path

def to_excel(pdf_file):
    with open(pdf_file, 'rb') as f:  # this opens the pdf file
        data = pdftotext.PDF(f)

    # ---------------operate the data, get title and value-----------
    datas = data[0].split('\r\n')
    finalData = list()
    for item in datas:
        if item != '':
            finalData.append(item)
    finalDataRefined = list()
    for item in finalData:
        if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
            finalDataRefined.append(item.strip())

    titles = list()
    values = list()
    for num, item in enumerate(finalDataRefined):
        if num % 2 == 0:
            titles.append(item)
        else:
            values.append(item)

    # ------------------get output file name---------------------
    OPRAST = values[1]
    filename = work_dir / f"{OPRAST}.xlsx"

    # ------------------create excel file sheet------------------
    excel_file = openpyxl.Workbook()
    excel_sheet = excel_file.active
    excel_sheet.append([])
    alphaList = list(ascii_uppercase)
    for alphabet in alphaList:
        excel_sheet.column_dimensions[alphabet].width = 20
    excel_sheet.append(titles)
    excel_sheet.append(values)

    # --------------------save----------------
    excel_file.save(filename)
    excel_file.close()

# -------------------main program---------------
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
    to_excel(pdf_file)

how to convert folder of pickle files into single csv file

I have a directory containing about 1700 pickle files; each file holds all the Twitter posts of one user. I want to convert it into a folder of CSV files, where each CSV file name is the name of the pickle file and each row contains one tweet of the user...
After that, I want just the top 20 CSVs with more samples than the others... how can I do that?
# khabarlist = open_file_linebyline(pkl_path)

def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        # if len(file.name.split()) > 1:
        #     continue
        # if file.split('.')[-1] != "pkl":
        with open(file, 'r', encoding='utf8') as f:
            items = [i.strip() for i in f.read().split(",")]
            my_dict[file.replace(".pkl", "")] = items
            df = pd.DataFrame(my_dict)
            df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")
I wrote the sample code for it, and it did not work...
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        if len(file.name.split()) > 1:
            continue
        if file.split('.')[-1] != "pkl":
            with open(file, 'r', encoding='utf-8', errors='replace') as f:
                print(f.readlines())
                items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
                my_dict[file.replace(".pkl", "")] = items
                df = pd.DataFrame(my_dict)
                df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")
And a better answer:
import os
import pandas as pd
import regex as re

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
for path in os.listdir(data_path):
    my_tweets = []
    df = pd.read_pickle(data_path + path)
    for tweet in df.tweet:
        url = re.findall(r"http\S+", tweet)
        if url == []:
            my_tweets.append(tweet)
    new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
    new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a")
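The "top 20 CSVs with the most samples" part of the question isn't handled above. A rough sketch (assuming the same data_path layout and that each pickle holds a dataframe, as in the loop above) could rank the pickles by row count first and only convert the 20 largest:

import os
import pandas as pd

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"

# Count rows per pickle file.
sizes = [(path, len(pd.read_pickle(data_path + path))) for path in os.listdir(data_path)]

# Keep the 20 largest and convert only those to CSV.
for path, n_rows in sorted(sizes, key=lambda t: t[1], reverse=True)[:20]:
    df = pd.read_pickle(data_path + path)
    df.to_csv(data_path + path.replace(".pkl", ".csv"), index=False)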

Script to read multiple input files and write a separate output file for each input file?

I have multiple input files, in a format like the one below, which have to be processed.
Input file path: /tmp/input
1.1.1.txt
1.1.2.txt
1.1.3.txt
But I want an output file for each input file in another folder, say /tmp/outputsmgr, like below:
1.1.1_output.csv
1.1.2_output.csv
1.1.3_output.csv
The issues are:
Firstly, I am not able to write the output files to another/different folder.
Secondly, after processing, the data from all input files gets merged into every file in the input folder, like below, instead of a separate output file per input file.
All the files below contain the same data; instead, 1.1.1.txt data should be in 1.1.1_output.csv and 1.1.2.txt data should be in 1.1.2_output.csv.
1.1.1.txt_output.csv
1.1.2.txt_output.csv
1.1.3.txt_output.csv
How can I modify the below code to get the desired result?
import os
import csv
import re

def parseFile(fileName):
    # We are using a dictionary to store info for each file
    data = list()
    # data = dict()
    fh = open(fileName, "r")
    lines = fh.readlines()[1:]
    for line in lines:
        line = line.rstrip("\n")
        if re.search("sessmgr", line):
            splitted = line.split()
            temp = dict()
            temp["CPU"] = splitted[0]
            temp["facility"] = splitted[1]
            temp["instance"] = splitted[2]
            temp["cpu-used"] = splitted[3]
            temp["cpu-allc"] = splitted[4]
            temp["mem-used"] = splitted[5]
            temp["mem-allc"] = splitted[6]
            temp["files-used"] = splitted[7]
            temp["files-allc"] = splitted[8]
            temp["sessions-used"] = splitted[9]
            temp["sessions-allc"] = splitted[10]
            # print(splitted[2])
            data.append(temp)
            # continue
    # print(data)
    return data

if __name__ == "__main__":
    inputsDirectory = "/tmp/input"
    outputDirectory = "/tmp/outputsmgr"
    path = os.path.abspath(inputsDirectory)
    pathout = os.path.abspath(outputDirectory)
    fileLists = ["{0}/{1}".format(path, x) for x in os.listdir(outputDirectory)]
    fileList = ["{0}/{1}".format(path, x) for x in os.listdir(inputsDirectory)]
    # print(fileList)
    csvRows = []
    for file in fileList:
        newRow = parseFile(file)
        csvRows.append(newRow)
    # print(csvRows)
    for files in fileList:
        outputFile = "output.csv"
        csvfile = open(os.path.join(files + "_" + outputFile), 'w')
        fieldnames = ["CPU",
                      "facility",
                      "instance",
                      "cpu-used",
                      "cpu-allc",
                      "mem-used",
                      "mem-allc",
                      "files-used",
                      "files-allc",
                      "sessions-used",
                      "sessions-allc"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        headers = {}
        for n in writer.fieldnames:
            headers[n] = n
        writer.writerow(headers)
        # writer.writeheader()
        for row in csvRows:
            for obj in row:
                print(obj)
                writer.writerow(obj)
I think the code below will do what you want. It processes the files in the input directory sequentially and the results returned from the parseFile() function get written to the corresponding output file in the output directory. It's important to get a new set of csvRows from each input file and write (just) those to each output file.
The code assumes the outputDirectory already exists, but if that's not the case, then you'll need to add code to create it before processing any of the files. Hint: use os.path.exists() and os.path.isdir() in conjunction with os.makedirs().
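For example, a small sketch of that hint (just one way to do it, placed before the main loop) could be:

import os

outputDirectory = "/tmp/outputsmgr"
output_dir = os.path.abspath(outputDirectory)

# Create the output folder if it does not already exist.
if not (os.path.exists(output_dir) and os.path.isdir(output_dir)):
    os.makedirs(output_dir)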
import csv
import os
import re

def parseFile(filePath, fieldnames, target_re=r"sessmgr"):
    """ Yield lines of file matching target regex. """
    with open(filePath, "r") as file:
        next(file)  # Skip/ignore first line.
        for line in file:
            if re.search(target_re, line):
                yield dict(zip(fieldnames, line.split()))

if __name__ == "__main__":
    OUTPUT_FILE_SUFFIX = "output.csv"
    inputsDirectory = "/tmp/input"
    outputDirectory = "/tmp/outputsmgr"
    fieldnames = ("CPU", "facility", "instance", "cpu-used", "cpu-allc", "mem-used",
                  "mem-allc", "files-used", "files-allc", "sessions-used",
                  "sessions-allc")

    input_dir = os.path.abspath(inputsDirectory)
    output_dir = os.path.abspath(outputDirectory)

    for in_filename in os.listdir(input_dir):
        in_filepath = os.path.join(input_dir, in_filename)
        print('in_filepath: "{}"'.format(in_filepath))
        in_rootname = os.path.splitext(in_filename)[0]
        out_filename = in_rootname + "_" + OUTPUT_FILE_SUFFIX
        out_filepath = os.path.join(output_dir, out_filename)
        print('out_filepath: "{}"'.format(out_filepath))

        with open(out_filepath, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(parseFile(in_filepath, fieldnames))
