Skip file if value is not in data using python - python

With my current code, I am trying to skip a csv file if it does not contain a value within the actual data that I am looking for.
Basically, if a file has "PROD_NAME" as a column, the script looks for the string 'NA_NRF' in that column and replaces it with 'FA_GUAR'. However, the first file in my folder does not have this column, so the script fails. I've looked into ways to skip files, but I have only found ways to skip based on the filename itself, not based on the data inside a file lacking the expected column. Any help would be appreciated. Thanks!
def worker(files):
    """Load each CSV under dest_dir as text and relabel 'NA_NRF' product names.

    Rows whose PROD_NAME contains 'NA_NRF' get PROD_NAME set to 'FA_GUAR'
    in the loaded frame.

    NOTE(review): `files` is never read here, and `file_count` is incremented
    without being initialised in this scope -- presumably it is declared
    elsewhere in the script; verify.
    """
    for filename in glob.glob(dest_dir + '\\*.csv'):
        my_file = Path(os.path.join(dest_dir, filename))
        # Raw text of the file; the header is everything before the '!1' marker.
        with open(filename) as f:
            read_data = f.read()
        header = read_data[:read_data.find('!1')]
        idx = header.find('\n')
        # A one-row read is enough to learn the column names.
        preview = pd.read_csv(filename, skiprows=1, encoding='ISO-8859-1', nrows=1)
        # Make all columns text, to avoid formatting errors.
        dtypes = {col: 'str' for col in preview.columns}
        df1 = pd.read_csv(filename, dtype=dtypes, skiprows=1, encoding='ISO-8859-1', quotechar="'", delimiter='\t')
        na_rows = df1['PROD_NAME'].str.contains('NA_NRF')
        df1.loc[na_rows, 'PROD_NAME'] = 'FA_GUAR'
        file_count += 1  # count the file
worker(files)

Could you just add an if statement before your transformation
if 'PROD_NAME' in df1.columns:
    # Only files that actually have the column are transformed; the others
    # fall through unchanged instead of raising KeyError.
    # na=False treats missing cells as "no match": without it the mask holds
    # NaN and .loc refuses it as a non-boolean indexer.
    df1.loc[df1['PROD_NAME'].str.contains('NA_NRF', na=False), 'PROD_NAME'] = 'FA_GUAR'
file_count += 1 # count the fil

Related

Python Script: replacing values in CSV's

In my Python script, I'm trying to read CSV files and, if a file has a column "PROD_NAME", find a value within that column and replace it with another value. Currently, whenever I run the script, everything goes through the "try" clause and it acts as if it is working, but when I look at the file itself, the values remain unchanged. Nothing hits the "except" clause, and the command prompt prints "Replaced" for each file it supposedly changed. Any help would be appreciated. Thanks!
def worker():
    """Load every CSV under dest_dir as text and replace "NA_NRF" cells.

    For files that have a PROD_NAME column, cells equal to "NA_NRF" are
    replaced with "FA_GUAR" in the loaded frame and a message is printed.
    The replacement is applied to the in-memory frame only; nothing in this
    function writes the result back to disk.
    """
    for filename in glob.glob(dest_dir + '\\*.csv'):  # loop over the CSV files
        my_file = Path(os.path.join(dest_dir, filename))
        try:
            with open(filename) as f:
                # First read only serves to get the list of columns.
                df1 = pd.read_csv(filename, skiprows=1, encoding='ISO-8859-1')
                # Make all columns text, to avoid formatting errors.
                dtypes = {col: 'str' for col in df1.columns}
                df1 = pd.read_csv(filename, dtype=dtypes, skiprows=1, encoding='ISO-8859-1')
                if 'PROD_NAME' in df1.columns:
                    df1 = df1.replace("NA_NRF", "FA_GUAR")
                    print("Replaced" + filename)
        except:
            if 'PROD_NAME' in df1.columns:
                print(filename)
worker()
Original DF:
!4 PROD_NAME ENTRY_YEAR
* NA_NRF 2014
The NA_NRF is supposed to change to FA_GUAR
This should do the job:
# Load the whole file first; the read handle is closed before the file is
# rewritten below.
with open(filename) as f:
    df_before = pd.read_csv(f, sep=';')
# A single membership test replaces the loop over every column name, so
# "nothing to change" is reported once per file instead of once per
# non-matching column.
if "PROD_NAME" in df_before.columns:
    df_after = df_before.replace("NA_NRF", "FA_GUAR")
    # Write the modified frame back over the original file.
    df_after.to_csv(filename, index=False, sep=';')
else:
    print("nothing to change")
When I added sep=';' it stopped giving me headaches about quotes...

Create csv file using python, where all the values are separated after the first space and placed in one column

I need help converting the simple_line.txt file to a CSV file using the pandas library. However, I am unable to split the lines correctly: I want the image file name in one column and everything after the first space in a second column.
Here is the file (sample_list.txt), listed row by row:
Image Label
doc_pres223.jpg Durasal
doc_pres224.jpg Tab Cefepime
doc_pres225.jpg Tab Bleomycin
doc_pres226.jpg Budesonide is a corticosteroid,
doc_pres227.jpg prescribed for inflammatory,
I want the csv file to be like-
enter image description here
# Input text file and the CSV file produced from it.
txt_file = r"./example.txt"
csv_file = r"./example.csv"
separator = "; "
with open(txt_file) as src, open(csv_file, "w+") as dst:
    # Only the first space of each line becomes the separator; any later
    # spaces stay inside the second field.
    dst.writelines(line.replace(" ", separator, 1) for line in src)
try this:
import pandas as pd
def write_file(filename, output):
    """Convert a space-separated "Image Label" text file into a CSV file.

    The first line of *filename* is treated as the header and skipped. Every
    other non-blank line is split at its first space: the part before it
    becomes the ``Image`` value and the remainder (which may itself contain
    spaces) becomes the ``Label`` value. A line without any space gets an
    empty label.

    Args:
        filename: Path of the text file to read.
        output: Path of the CSV file to write. The default integer index is
            written as the first column.
    """
    rows = []
    # The context manager closes the input even if parsing fails.
    with open(filename, 'r') as src:
        next(src, None)  # skip the header line; tolerate an empty file
        for line in src:
            line = line.rstrip('\n')
            if not line:
                continue  # blank lines carry no record
            image, _, label = line.partition(' ')
            rows.append({'Image': image, 'Label': label})
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    df = pd.DataFrame(rows, columns=['Image', 'Label'])
    df.to_csv(output)


if __name__ == '__main__':
    write_file('example.txt', 'example.csv')
If the filenames in the Image column are always the same length, then you could just treat it as a fixed-width file. The first column would be 15 characters, and the rest of the line is the second column. Then just add two empty columns and write the result to a new file.
# libraries
import pandas as pd
# Name of the input text file.
filename = "simple_line.txt"
# Read as fixed width: the first field is 15 characters wide, the second up
# to 100 characters; the first line of the file supplies the column names.
df = pd.read_fwf(filename, header=0, widths=[15, 100])
# Add 2 empty columns at positions 1 and 2, i.e. right after the first column.
df.insert(1, 'empty1', '')
df.insert(2, 'empty2', '')
# Save as a new ';'-separated csv file, with the header row and without the
# index column.
filenew = "output.csv"
df.to_csv(filenew, sep=';', header=True, index=False)

Split values in CSV that look like JSON

So I have a CSV file with a column called content. However, the contents of this column look like JSON and therefore contain further fields. I would like to split these contents into multiple columns, or extract the final part after "value". See the picture below for an example of the file. Any ideas on how to do this? I would prefer using Python. I don't have any experience with JSON.
Using pandas you could do in a simpler way.
EDIT updated to handle the single quotes:
import pandas as pd
import json
# Read the file with "\n" as the delimiter and select the column whose
# header is "content".
# NOTE(review): recent pandas releases reject "\n" as a delimiter -- confirm
# against the installed version.
data = pd.read_csv('test.csv', delimiter="\n")["content"]
# Swap single quotes for double quotes so each cell can be parsed as JSON.
# NOTE(review): this also rewrites apostrophes inside values -- presumably
# the data contains none; verify.
res = [json.loads(row.replace("'", '"')) for row in data]
# Build a frame from the parsed records.
result = pd.DataFrame(res)
# Preview of the first rows (the return value is not used here).
result.head()
# Export result to CSV
result.to_csv("result.csv")
my csv:
result:
This script will create a new csv file with the 'value' added to the csv as an additional column
(make sure that the input_csv and output_csv are different filenames)
import csv
import json
# Source file and the file that receives the extra "value" column; these
# must be two different paths.
input_csv = "data.csv"
output_csv = "data_updated.csv"

# Single pass: each row is parsed and written out immediately, so the value
# always belongs to the row it is appended to. (Reading the file once with
# DictReader and again with reader can misalign them, because DictReader
# silently skips blank rows and reader does not.)
with open(input_csv, newline="") as f_in, open(output_csv, "w") as f_out:
    reader = csv.reader(f_in)
    writer = csv.writer(f_out, lineterminator="\n")
    header = next(reader)
    content_idx = header.index("content")  # position of the JSON-like column
    writer.writerow(header + ["value"])
    for row in reader:
        if not row:
            continue  # blank line: nothing to parse or copy
        # The cells use single quotes; swap them so json can parse the text.
        value = json.loads(row[content_idx].replace("'", '"'))["value"]
        writer.writerow(row + [value])

How to write the filename and rowcount in a csv in python?

I've been trying to build a CSV from a big list of other CSVs, and here's the deal: I want to get the names of these CSV files and put them in the CSV that I want to create, and I also need the row count of each CSV file whose name I'm collecting. Here's what I've tried so far:
def getRegisters(file):
    """Return (and print) the number of rows in the tab-separated file *file*.

    Every line counts as data because the file is read with ``header=None``.
    Lines with too many fields are skipped with a warning rather than
    aborting the read, and a completely empty file counts as 0 rows.

    Args:
        file: Path of the tab-separated file to count.

    Returns:
        The number of rows pandas parsed from the file.
    """
    try:
        # on_bad_lines replaces error_bad_lines, which was removed in pandas 2.0;
        # 'warn' keeps the old "skip the line but report it" behaviour.
        results = pd.read_csv(file, header=None, on_bad_lines='warn', sep='\t', low_memory=False)
    except pd.errors.EmptyDataError:
        # An empty file has no rows; report that instead of stopping the listing.
        row_count = 0
    else:
        row_count = len(results)
    print(row_count)
    return row_count
# Folder whose entries are listed; the listing is taken before the report
# file is opened.
path = "C:/Users/gdldieca/Desktop/TESTSFORPANW/New folder"
dirs = os.listdir(path)
with open("C:/Users/gdldieca/Desktop/TESTSFORPANW/New folder/FilesNames.csv", 'w', newline='') as f:
    writer = csv.writer(f, delimiter = '\t')
    writer.writerow(("File", "Rows"))
    # One (name, row count) record per directory entry, written in listing order.
    writer.writerows(
        (entry, getRegisters("C:/Users/gdldieca/Desktop/TESTSFORPANW/New folder/" + str(entry)))
        for entry in dirs
    )
However, I can't seem to get the files' row counts written, even though pandas actually returns them. I'm getting this error:
_csv.Error: iterable expected, not int
The final result would be something like this written into the CSV
File1 90
File2 10
If you are using pandas, I think you can also use it to build a CSV file with all the values that you need. Here is an alternative:
import os
import pandas as pd
# Folder that holds the CSV files to inspect.
directory = 'D:\\MY\\PATH\\ALLCSVFILE\\'
# One record per ".csv" entry: its file name and the number of rows pandas
# reads from it.
rows_list = [
    {'namefile': entry, 'count': len(pd.read_csv(os.path.join(directory, entry)).index)}
    for entry in os.listdir(directory)
    if entry.endswith(".csv")
]
# Turn the records into a frame, show it, and save it.
df1 = pd.DataFrame(rows_list)
print(df1)
df1.to_csv('test.csv', sep=',')
result :

Removing a row from CSV with python if data wasn't recorded in a column

I'm trying to import a batch of CSV's into PostgreSQL and constantly run into an issue with missing data:
psycopg2.DataError: missing data for column "column_name" CONTEXT:
COPY table_name, line where ever in the CSV that data wasn't
recorded, and here are data values up to the missing column.
There is no way to get the complete set of data written to the row at times, and I have to deal with the files as is. I am trying to figure a way to remove the row if data wasn't recorded into any column. Here's what I have:
file_list = glob.glob(path)
for f in file_list:
    filename = os.path.basename(f)  # file name without its directory
    arc_csv = arc_path + filename  # path for the revised copy of the CSV
    with open(f, 'r') as inp, open(arc_csv, 'wb') as out:
        # Copy across only the rows that have no empty field.
        csv.writer(out).writerows(row for row in csv.reader(inp) if "" not in row)
    cursor.execute("COPY table_name FROM %s WITH CSV HEADER DELIMITER ','",(arc_csv,))
You could use pandas to remove rows with missing values:
import glob, os, pandas
file_list = glob.glob(path)
for f in file_list:
    filename = os.path.basename(f)  # file name without its directory
    arc_csv = arc_path + filename  # destination of the cleaned copy
    # The first CSV column is used as the index, so to_csv does not add a
    # new index column to the output.
    data = pandas.read_csv(f, index_col=0)
    # dropna() removes every row that has at least one missing value in a
    # single vectorised call, instead of running a Python lambda per row.
    data.dropna().to_csv(arc_csv)  # writes the revised data to csv
However, this could get slow if you're working with large datasets.
EDIT - added index_col=0 as an argument to pandas.read_csv() to prevent the added index column issue. This uses the first column in the csv as an existing index. Replace 0 with another column's number if you have reason not to use the first column as index.
Unfortunately, you cannot parameterize table or column names. Use string formatting, but make sure to validate/escape the value properly:
# The SQL text is assembled with str.format, so the substituted value must be
# validated/escaped before this call.
# NOTE(review): the value substituted here is arc_csv (a file path), and the
# placeholder is not wrapped in quotes in the SQL text -- confirm that the
# resulting COPY statement is valid.
cursor.execute("COPY table_name FROM {column_name} WITH CSV HEADER DELIMITER ','".format(column_name=arc_csv))

Categories

Resources