Pandas not updating CSV - python

Dataset:
https://github.com/Bene939/newsheadlinedatasets
With my program I am labeling my dataset of news headlines. It worked fine until today.
For some reason it won't write the CSV file anymore, even though, as far as I can see, the data frame does get updated.
At around row 4469 of my CSV it started to skip overwriting the file intermittently, and at row 4474 it stopped overwriting completely. It worked fine until now, and if I create a new CSV it writes to that one without problems.
I am using Jupyter Notebook. Is there some kind of limit here? The labeled dataset is around 300 KB.
!pip install pandas
!pip install pathlib

import pandas as pd
from pathlib import Path

#takes data frame and file name & appends it to given csv
def append_df(df, file_name):
    my_file = Path(file_name)
    if my_file.exists():
        print("Appending to existing file named " + file_name)
        orig_df = pd.read_csv(file_name)
        print("Old Data Frame: ")
        print(orig_df)
        new_df = pd.concat([orig_df, df], ignore_index=True).drop_duplicates()
        print("New Data Frame: ")
        print(new_df)
        new_df.to_csv(file_name, index=False, header=True, encoding='utf-8-sig')
    else:
        print("Creating new file named " + file_name)
        news_sentiment_df.to_csv(file_name, index=False, header=True, encoding='utf-8-sig')

#takes data frame and file name & overwrites given csv
def update_csv(df, file_name):
    print("Overwriting " + file_name)
    df.to_csv(file_name, index=False, header=True, encoding='utf-8-sig')

#shows sentence by sentence, labels it according to input and saves it in a new csv file
print("WARNING: EDITING CSV FILE WITH EXCEL MAY CORRUPT FILE\n")

file_name = "news_headlines.csv"
new_file = "news_headlines_sentiment.csv"
news_sentiment_df = pd.DataFrame(columns=["news", "sentiment"])

my_file = Path(file_name)
if my_file.exists():
    df = pd.read_csv(file_name, encoding='utf-8-sig', error_bad_lines=False)
    print("Loaded " + file_name)
    for index, row in df.iterrows():
        user_input = -1
        valid_inputs = [0, 1, 2]
        while user_input not in valid_inputs:
            print("####################################################################")
            print(row["news"])
            try:
                user_input = int(input("Negative: 0\nNeutral: 1\nPositive: 2\n"))
            except ValueError as err:
                print("\nPlease enter an Integer!\n")
        new_element = 0
        #label sentiment according to input
        if user_input == 0:
            new_element = [row["news"], 0]
        elif user_input == 1:
            new_element = [row["news"], 1]
        elif user_input == 2:
            new_element = [row["news"], 2]
        #save labeled sentence to new file
        news_sentiment_df.loc[len(news_sentiment_df)] = new_element
        append_df(news_sentiment_df, new_file)
        #delete data point from original data frame
        index_name = df[df["news"] == row["news"]].index
        df.drop(index_name, inplace=True)
        #update old csv file
        update_csv(df, file_name)
else:
    print("File not Found")
else:
print("File not Found")

It turned out to be my own mistake: I was adding duplicate rows while using the drop_duplicates function, so the dropped duplicates made it look like the CSV was no longer being updated.
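A minimal sketch of one way to avoid that interaction, assuming the goal is simply to persist each newly labeled row: write only the new row in append mode instead of re-reading the whole file, concatenating, and dropping duplicates. The append_row helper below is my own addition for illustration, not from the original post.

import pandas as pd
from pathlib import Path

def append_row(row_values, file_name, columns=("news", "sentiment")):
    """Append a single labeled row to the CSV; write the header only when the file is new."""
    new_row = pd.DataFrame([row_values], columns=list(columns))
    write_header = not Path(file_name).exists()
    new_row.to_csv(file_name, mode="a", index=False, header=write_header, encoding="utf-8-sig")

# Hypothetical usage inside the labeling loop, instead of append_df(news_sentiment_df, new_file):
# append_row([row["news"], user_input], new_file)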

Related

Efficient way to combine multiple csv

I have over 100K CSV files (total size north of 150 GB) which I need to join. All have standard column names, although the column order may differ and some CSVs are missing a few columns.
So far I have just created a dataframe and kept concatenating the dataframe read from each CSV in every iteration, building one standard dataframe containing all columns, which I eventually intended to save as a CSV.
I tried building a dataframe from 1000 sample CSVs and noticed that as the dataframe grew, the iteration rate dropped from 10 to 1.5 per second, which suggests a similar trend if I went all-in with 100K CSVs, taking days if not months to combine them.
Is there a better way of combining a huge number of CSV files?
Here is my code
df_t1 = pd.DataFrame()
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
    df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
    if thisCSV.endswith('type1.csv'):
        df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Possible improvement
Method 1: Using Pandas
#df_t1 = pd.DataFrame()
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    if thisCSV.endswith('type1.csv'):
        df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
        #df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])  # unnecessary loop, use the next line instead
        df["File Name"] = thisCSV  # places thisCSV in every row
        #df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)  # concat in the loop is slow, append to a list instead
        df_t1_lst.append(df)
df_t1 = pd.concat(df_t1_lst, ignore_index=True)  # form the dataframe from the list (faster than pd.concat in a loop)
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Method 1a
Using Pandas to continuously append to CSV output file
import os
import pandas as pd
from tqdm import tqdm

def str_to_bytes(s):
    """ String to byte array """
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

SEPARATOR = ','      # Separator used by CSV file
write_header = True

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]

output_file = outpath + "df_t1.csv"
with open(output_file, "w") as ofile:
    pass                 # create empty output file

for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    input_file = pathxl + "\\" + thisCSV
    if thisCSV.endswith('type1.csv') and good_file(input_file):
        df = pd.read_csv(input_file)
        if df.shape[0] > 0:
            df['File Name'] = thisCSV          # Add filename column
            df = df.sort_index(axis=1)         # sort columns in ascending order
            # Append to output file
            df.to_csv(output_file, mode='a',
                      index=False,
                      header=write_header)
            write_header = False               # Only write header once
            del df
Method 2: Binary Files
Reading/Writing binary and using memory-map should be faster.
from tqdm import tqdm
import os
import mmap

def str_to_bytes(s):
    """ String to byte array """
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

SEPARATOR = ','      # Separator used by CSV file
header = None

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

with open(outpath + "df_t1.csv", "wb") as ofile:
    for i in tqdm(range(len(excelNames))):
        thisCSV = str(excelNames[i]).lower().strip()
        input_file = pathxl + "\\" + thisCSV
        if thisCSV.endswith('type1.csv') and good_file(input_file):
            with open(input_file, "rb") as ifile:
                print('file ', thisCSV)
                # memory-map the file, size 0 means whole file
                with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                    text_iter = iter(mmap_obj.read().split(b'\n'))
                    if header is None:
                        header = next(text_iter)
                        header = header.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
                        ofile.write(header)      # write header once
                    else:
                        next(text_iter)          # ignore header row
                    # write data rows to output file, appending the file name column
                    file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
                    for line in text_iter:
                        if line.strip():         # skip blank lines
                            ofile.write(line.rstrip() + file_value)

Problem reading the mu (μ) character in Python

I have an input file with one row that contains multiple mu (μ) characters. The Python code opens the file, does some manipulation, and saves the result in .csv format. When I save that file as .csv it produces weird characters (�). The attached images show the input and output files as they appear when opened in Excel.
Input CSV file:
Output CSV file:
from pathlib import Path
import pandas as pd
import time
import argparse

parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('path',
                    help='define the directory to folder/file')

start = time.time()

def main(path_files):
    rs_columns = "SourceFile,RowNum,SampleID,Method,Element,Result".split(",")
    rs = pd.DataFrame(columns=rs_columns)
    if path_files.is_file():
        fnames = [path_files]
    else:
        fnames = list(Path(path_files).glob("*.csv"))
    for fn in fnames:
        if "csv" in str(fn):
            #df = pd.read_csv(str(fn))
            df = pd.read_csv(str(fn), header=None, sep='\n')
            df = df[0].str.split(',', expand=True)
        else:
            print("Unknown file", str(fn))
        non_null_columns = [col for col in df.columns if df.loc[:, col].notna().any()]
        # loop thru each column for the whole file and create a row of results in the output file
        for i in range(1, len(non_null_columns)):
            SourceFile = Path(fn.name)
            Method = "WetScreening"
            Element = df.iloc[1, i]
            print(Element)
            for j in range(2, len(df)):
                RowNum = j + 1
                Result = df.iloc[j, i]
                SampleID = df.iloc[j, 0]
                rs = rs.append(pd.DataFrame({
                    "SourceFile": [SourceFile],
                    "RowNum": [RowNum],
                    "SampleID": [SampleID],
                    "Method": [Method],
                    "Element": [Element],
                    "Result": [Result]
                }), ignore_index=True)
    rs.to_csv("check.csv", index=False)
    print("Output: check.csv")

if __name__ == "__main__":
    start = time.time()
    args = parser.parse_args()
    path = Path(args.path)
    main(path)
    print("Processed time: ", time.time() - start)
Any help?
Try encoding to utf-8:
rs.to_csv("check.csv",index=False, encoding='UTF-8')
See also Pandas df.to_csv("file.csv" encode="utf-8") still gives trash characters for minus sign
That answer mentions the BOM bytes (0xEF, 0xBB, 0xBF) at the start of the file, which act as a UTF-8 signature that Excel recognizes:
rs.to_csv('file.csv', index=False, encoding='utf-8-sig')
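A minimal sketch of the round trip, with made-up data (the frame contents below are assumptions, not from the original post): writing with encoding='utf-8-sig' prepends the BOM so Excel detects UTF-8 and renders μ correctly, and reading back with the same encoding strips the BOM again.

import pandas as pd

# Hypothetical frame containing the micro sign
df = pd.DataFrame({"Element": ["Cu"], "Result": ["12 μg/L"]})

# utf-8-sig writes the BOM (0xEF 0xBB 0xBF) so Excel opens the file as UTF-8
df.to_csv("check.csv", index=False, encoding="utf-8-sig")

# Reading back with the same encoding strips the BOM from the first column name
print(pd.read_csv("check.csv", encoding="utf-8-sig"))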

how to convert folder of pickle files into single csv file

I have a directory containing about 1700 pickle files, where each file holds all of one user's Twitter posts. I want to convert it into a folder of CSV files, where each CSV file takes the name of its pickle file and each row contains one tweet of that user.
After that, I want to keep just the 20 CSVs with the most samples. How can I do that?
# khabarlist = open_file_linebyline(pkl_path)

def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        # if len(file.name.split()) > 1:
        #     continue
        # if file.split('.')[-1] != "pkl":
        with open(file, 'r', encoding='utf8') as f:
            items = [i.strip() for i in f.read().split(",")]
            my_dict[file.replace(".pkl", "")] = items
            df = pd.DataFrame(my_dict)
            df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")
I wrote this sample code for it and it did not work:
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        if len(file.name.split()) > 1:
            continue
        if file.split('.')[-1] != "pkl":
            with open(file, 'r', encoding='utf-8', errors='replace') as f:
                print(f.readlines())
                items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
                my_dict[file.replace(".pkl", "")] = items
                df = pd.DataFrame(my_dict)
                df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")
And a better answer:
import os
import pandas as pd
import regex as re

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"

for path in os.listdir(data_path):
    my_tweets = []
    df = pd.read_pickle(data_path + path)
    for tweet in df.tweet:
        url = re.findall(r"http\S+", tweet)
        if url == []:
            my_tweets.append(tweet)  # keep only tweets without links
    new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
    new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a")
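For the "top 20 CSVs with the most samples" part of the question, a minimal sketch (assuming the same data_path as above; the ranking approach is my own suggestion, not from the original answer) is to count the rows in each pickle and keep the 20 largest before converting them:

import os
import pandas as pd

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"

# Count tweets per pickle file and keep the 20 files with the most rows
counts = {path: len(pd.read_pickle(data_path + path))
          for path in os.listdir(data_path) if path.endswith(".pkl")}
top_20 = sorted(counts, key=counts.get, reverse=True)[:20]
print(top_20)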

Print Unknown Length of Table to a CSV file using Python/Pandas

I have a table I want to write out to a CSV file using pandas. The table was extracted from a different Excel file. The problem is that the table length is unknown. How should I write this table to the CSV file so that all of it is shown, and not just on one line?
for x in ABC:
    print()
    print(f"{x}:")
    try:
        df = pd.read_csv(x + "/File.csv")
        df_filter = df[['A', 'B', 'C', "D", "E"]]
        if df_filter['D'].str.contains('Fail').any():
            noSC = df_filter[df_filter.DUTId != 'SC_INFO']
            finalTable = noSC[noSC.D == 'Fail']
            if finalTable.empty:
                print("Did not complete")
                sheet1['A16'] = finalTable
            else:
                filterTable = finalTable[['A', 'B', 'C', "E"]]
                fullfinalTable = filterTable.to_string()
                print(fullfinalTable)
        else:
            print("Run Successful")
    except FileNotFoundError:
        print("File does not exist")
I know that sheet1['A16'] = finalTable is wrong, but I am unsure what to do instead. It does output the table, but only in A16, so it ends up as one long line. Is there any way to have the unknown-length table formatted into the new Excel file?
Try this instead.
from pathlib import Path
import pandas as pd

dir_path = Path(r"yourFolderPath")
files_list = [str(p) for p in dir_path.glob("**/*.csv")]

if files_list:
    source_dfs = [pd.read_csv(file_) for file_ in files_list]
    df = pd.concat(source_dfs, ignore_index=True)
    df = df[['A', 'B', 'C', "D", "E", "DUTId"]]
    if df['D'].str.contains('Fail').any():
        df = df[df.DUTId != 'SC_INFO']
        finalTable = df[df.D == 'Fail']
        if finalTable.empty:
            print("Did not complete. Dataframe is empty.")
        else:
            print("Dataframe written to .csv")
            finalTable = finalTable[['A', 'B', 'C', 'E']]
            finalTable.to_csv(dir_path / "finaltable.csv")
else:
    print(f"No .csv files in {dir_path}")

Split a dataframe into multiple data set based on condition and each sub set into Excel

Can someone please help me here? I get no output and no error message either. I am trying to filter a dataframe into multiple subsets using custom conditions and paste each subset into its own Excel worksheet.
[Images: Master_data (df), Output A, Output B]
import pandas as pd
import os
## Belgium\2020\GMC Prep Automation")
from openpyxl import load_workbook
import xlsxwriter
from shutil import copyfile

file = input("please enter excelfile: ")
extension = os.path.splitext(file)[1]
filename = os.path.splitext(file)[0]
pth = "\\we.interbrew.net\\DFSEurope\\Crown Jewels\\Revenue Management\\WEST\\2. BE\\4. MPM Belgium\\2020\\GMC Prep Automation"
newfile = os.path.join(pth, filename + "_2" + extension)
#myfile = os.path.join(pth, Split_Test.xlsx)
df = pd.read_excel(file)
colpick = input("enter column to be splitted: ")
col = list(set(df[colpick].values))

def sendtoexcel(col):
    copyfile(file, newfile)
    for j in col:
        writer = pd.ExcelWriter(newfile, engine='openpyxl')
        for myname in col:
            mydf = df.loc[df[colpick] == myname]
            mydf.to_excel(writer, sheet_name=myname, index=False)
        writer.save()
    print("\nCompleted")
    return
Assuming the user inputs a correct file name and an existing column, consider a groupby run instead of a double for loop over the same column. The code is wrapped in try/except in case the user enters an incorrect column name or there is some issue exporting the data frame to Excel.
from openpyxl import load_workbook
...

colpick = input("enter column to be splitted: ")
colpick = colpick.title().strip()

def sendtoexcel():
    try:
        with pd.ExcelWriter(file, engine='openpyxl') as writer:
            writer.book = load_workbook(file)
            for i, sub in df.groupby([colpick]):
                sub.to_excel(writer, sheet_name=i, index=False)
            writer.save()
    except Exception as e:
        print(e)

# ACTUALLY RUN FUNCTION
sendtoexcel()
