Python - Flask CSV file upload error handling [duplicate]

This question already has an answer here:
Python: One Try Multiple Except
(1 answer)
Closed 4 years ago.
Essentially I am able to do what I want with my current code: upload a CSV file, manipulate it with pandas, and then update a MSQL database. I would like to add error handling, because the upload function only works for one particular file and throws different errors for all others.
Is there a way that I can catch multiple errors and return an error message to the user? Possibly something like a check on the input CSV file's column headers.
@app.route('/upload', methods=['GET', 'POST'])
def csv_input():
    tempfile_path = tempfile.NamedTemporaryFile().name
    #file.save(tempfile_path)
    #sheet = pd.read_csv(tempfile_path)
    if request.method == 'POST':
        file = request.files['file']
        if file:  # and allowed_filename(file.filename):
            #filename = secure_filename(file.filename)
            file.save(tempfile_path)
            input_csv = pd.read_csv(tempfile_path, sep=",", engine='python')

            #### Data cleansing from uploaded data
            col_titles = ['id', 'title', 'vote_average', 'w_average', 'vote_count', 'year', 'runtime',
                          'budget', 'revenue', 'profit']
            # Only keep data where the original language is English
            input_csv = input_csv[input_csv['original_language'] == 'en']
            # New dataframe that only contains data with vote count >= 10
            input_csv = input_csv[input_csv['vote_count'] >= 10]
            # Fill all NA values with 0 - needed to set datatypes
            input_csv = input_csv.fillna(0)
            # Remove all rows with no runtime
            input_csv = input_csv[input_csv['runtime'] != 0]
            # Remove all duplicate rows
            input_csv = input_csv.drop_duplicates()
            input_csv['vote_average'] = input_csv.vote_average.astype(float).round(1)
            input_csv.vote_average.round(1)
            input_csv['runtime'] = input_csv.runtime.astype(int)
            input_csv['vote_count'] = input_csv.vote_count.astype(int)
            input_csv['revenue'] = input_csv.revenue.astype('int64')
            input_csv['budget'] = input_csv.budget.astype('int64')
            profit_cal(input_csv, 'revenue', 'budget', 'profit')
            input_csv['profit'] = input_csv.profit.astype('int64')
            input_csv['profit'] = input_csv.profit.replace(0, 'No Data')
            #reorder_data = pd.DataFrame(input_csv)
            # Year cleaning
            input_csv['year'] = pd.to_datetime(input_csv['release_date'], errors='coerce').apply(
                lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
            #C = reorder_data['vote_average'].mean()
            #m = reorder_data['vote_count'].quantile(0.10)
            #w_average = org_data.copy().loc[reorder_data['vote_count'] >= m]

            #### IMDB data calculation
            V = input_csv['vote_count']
            R = input_csv['vote_average']
            C = input_csv['vote_average'].mean()
            m = input_csv['vote_count'].quantile(0.10)
            input_csv['w_average'] = (V / (V + m) * R) + (m / (m + V) * C)
            #C = input_csv['vote_average'].mean()
            #m = input_csv['vote_count'].quantile(0.10)
            #input_csv['w_average'] = input_csv.apply(weighted_rating, axis=1)
            input_csv['w_average'] = input_csv.w_average.astype(float).round(1)

            # Reorder the data and output in the correct order
            reorder_data = input_csv[col_titles]
            reorder_data.to_sql(name='title_data', con=engine, if_exists='replace', index=False)

            ##### Genre loads == DataFrame 2
            df = input_csv
            v = df.genres.apply(json.loads)
            df = pd.DataFrame({
                'id': df['id'].values.repeat(v.str.len(), axis=0),
                'genre': np.concatenate(v.tolist())
            })
            df['genre'] = df['genre'].map(lambda x: x.get('name'))
            genre_data = df.genre.str.get_dummies().sum(level=0)
            genre_data = df.loc[(df != 0).any(1)]
            #genre_data = genre_data.set_index('id')
            genre_order = ['id', 'genre']
            ## DataFrame to SQL
            genre_data[genre_order].to_sql(name='genre_data', con=engine, if_exists='replace', index=False)

            ####### Keyword search ### DataFrame
            #genre_data.to_csv("genre_data.csv")
            #return genre_data[genre_order].to_html()
            flash('Database has been updated successfully', 'success')
            #return reorder_data[col_titles].to_html()
            #stream = io.StringIO(file.stream.read().decode("UTF8"), newline=None)
            #csv_input = csv.reader(stream)
            #return reorder_data.to_html(index=False)
            #flash('File Uploaded Successfully')
            #return redirect(url_for('index'))
    return render_template('upload.html')

There are several methods:
The Python way: wrap the parsing and processing logic in try/except blocks with the relevant exception classes.
from pandas.errors import EmptyDataError, ParserError

try:
    # parsing & processing logic here
    pass
except EmptyDataError as ex:
    # tell user we don't accept empty data
    pass
except ParserError as ex:
    # tell user we failed to parse their input
    pass
except Exception as ex:
    # tell user that something went wrong
    pass
The Flask way: register error handlers with Flask for specific exceptions (this affects the whole Flask application):
@app.errorhandler(pandas.errors.EmptyDataError)
def handle_empty_data(error):
    return 'Failed parsing Input', 200
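For the column-header check mentioned in the question, a minimal sketch that combines both ideas could look like this. It assumes the same app, pd, request, flash and render_template setup as in the question's code; REQUIRED_COLUMNS is a hypothetical constant you would adapt to your own file.
from pandas.errors import EmptyDataError, ParserError

# Hypothetical set of headers the uploaded CSV must contain.
REQUIRED_COLUMNS = {'id', 'title', 'vote_average', 'vote_count',
                    'runtime', 'budget', 'revenue', 'release_date', 'genres'}

@app.route('/upload', methods=['GET', 'POST'])
def csv_input():
    if request.method == 'POST':
        file = request.files.get('file')
        if not file:
            flash('No file was uploaded', 'danger')
            return render_template('upload.html')
        try:
            input_csv = pd.read_csv(file, sep=',', engine='python')
        except EmptyDataError:
            flash('The uploaded file is empty', 'danger')
            return render_template('upload.html')
        except ParserError:
            flash('The uploaded file could not be parsed as CSV', 'danger')
            return render_template('upload.html')
        missing = REQUIRED_COLUMNS - set(input_csv.columns)
        if missing:
            flash('Missing expected columns: {}'.format(', '.join(sorted(missing))), 'danger')
            return render_template('upload.html')
        # ... existing cleansing / to_sql logic from the question goes here ...
        flash('Database has been updated successfully', 'success')
    return render_template('upload.html')
This keeps all user-facing feedback in flash messages instead of letting exceptions bubble up to the user.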

Related

Alphabetically ordering worksheets using openpyxl in WriteOnlyWorksheet mode

Background: I have created a script that takes a .xlsx file and splits the groupings of data into separate worksheets, based on the names of each grouping. E.g., Client A's grouped data will be written to a worksheet named Client A, and so forth. Additionally, the script strips out Excel-invalid characters and miscellaneous strings contained within INVALID_TITLE_CHARS and INVALID_TITLE_NAMES, and uses the values in management_style and model_type to decide which rows are irrelevant and therefore skipped during the copy process.
My issue: I would like to expand the script so that the worksheets are alphabetical, e.g., Client A, Client B, Client C, and so forth. I have tried to achieve this with client_sheet._sheets.sort(key=lambda output_workbook: output_workbook.title); however, for memory reasons I have used WriteOnlyWorksheet, which seems incompatible with _sheets. Does anyone know if there is another solution? I was hoping to avoid creating a function that reopens the .xlsx.
The script: this is the full script:
import time
import logging
import sys
import datetime as dt
import datetime
import requests as requests
import json
import enlighten
import warnings
from openpyxl import load_workbook
from openpyxl import LXML
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
from copy import copy
from configparser import ConfigParser
from requests.auth import HTTPBasicAuth
logger = logging.getLogger()
timestr = datetime.datetime.now().strftime("%Y-%m-%d")
warnings.filterwarnings("ignore")
def configure_logging():
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
INVALID_TITLE_NAMES = ["zz_ FeeRelationship", "Family"]
def clean_sheet_title(title):
    title = title or ""
    title = title.strip()
    title = title.translate(INVALID_TITLE_CHAR_MAP)
    for name in INVALID_TITLE_NAMES:
        title = title.replace(name, "")
    return title[:31]

def is_client_row(row, row_dimension):
    return row_dimension.outlineLevel == 0

def create_write_only_cell(source_cell, target_sheet):
    target_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
    target_cell.data_type = source_cell.data_type
    if source_cell.has_style:
        target_cell.font = copy(source_cell.font)
        # target_cell.border = copy(source_cell.border)
        # target_cell.fill = copy(source_cell.fill)
        target_cell.number_format = copy(source_cell.number_format)
        # target_cell.protection = copy(source_cell.protection)
        target_cell.alignment = copy(source_cell.alignment)
    return target_cell

def create_write_only_row(source_row, target_sheet):
    return [create_write_only_cell(cell, target_sheet) for cell in source_row]
def skip_row(row, row_dimension):
    """
    Determine whether a row needs to be skipped and not copied to the new workbook
    """
    def get_column_value(column):
        value = row[column].value or ""
        return value.strip()
    # skip total line
    if row[0].value == "Total":
        return True
    management_style = [
        "Advisory",
        "Advisory - No Fee",
        "Holding",
        "JPAS",
        "Liquidity Management",
        "Trading",
        "",
    ]
    model_type = ["Client", "Holding Account", "Holding Company", "Trust", ""]
    management_value = get_column_value(3)
    model_value = get_column_value(11)
    # Pass on either column
    return management_value not in management_style and model_value not in model_type
    # # Pass on both columns
    # return management_value not in management_style or model_value not in model_type
def split_workbook(input_file, output_file):
    """
    Split workbook: each client into its own sheet.
    """
    workbook = None
    output_workbook = None  # initialise so the finally block is safe even if loading fails
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        skip_child = False
        skipped_parent_outline_level = 0
        skipped_rows_per_client = 0
        rows = data_sheet.rows
        header = next(rows)
        for index, row in enumerate(rows, start=2):
            row_dimension = data_sheet.row_dimensions[index]
            # verify whether current row is a child of a skipped parent
            if skip_child and skipped_parent_outline_level < row_dimension.outlineLevel:
                skipped_rows_per_client += 1
                continue
            # reset skip_child when current row is no longer a child of a skipped parent
            if (
                skip_child
                and skipped_parent_outline_level >= row_dimension.outlineLevel
            ):
                skip_child = False
            # check whether row needs to be skipped
            if skip_row(row, row_dimension):
                skipped_rows_per_client += 1
                skip_child = True
                skipped_parent_outline_level = row_dimension.outlineLevel
                continue
            # create a new sheet when a new client is found
            if is_client_row(row, row_dimension):
                skipped_rows_per_client = 0
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            # copy row dimensions
            new_row_index = index - skipped_rows_per_client - client_row_index + 2
            client_sheet.row_dimensions[new_row_index] = copy(row_dimension)
            client_sheet.row_dimensions[new_row_index].worksheet = client_sheet
            # finally copy the row
            client_sheet.append(create_write_only_row(row, client_sheet))
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook:
            workbook.close()
        if output_workbook:
            output_workbook.close()
if __name__ == "__main__":
start = time.time()
configure_logging()
input_file = 'Input_File'+timestr+'.xlsx'
output_file = 'Output_File'+timestr+'.xlsx'
logger.info(f"Using lxml mode: {LXML}")
split_workbook(input_file, output_file)
logger.info("Time consumed: % s seconds" % (time.time() - start))
I am able to achieve this by using _sheets.sort() on the titles. Here is an example to demonstrate it...
from openpyxl import Workbook
from openpyxl.cell import WriteOnlyCell
# Create the workbook in write_only mode, as you did
output_workbook = Workbook(write_only=True)
#Add 4 sheets
ws1 = output_workbook.create_sheet("clientB_sheet_title")
ws2 = output_workbook.create_sheet("clientA_sheet_title")
ws3 = output_workbook.create_sheet("clientD_sheet_title")
ws4 = output_workbook.create_sheet("clientC_sheet_title")
#Add some data at start
cell = WriteOnlyCell(ws1, value="hello clientB")
ws1.append([cell, 3.14, None])
cell = WriteOnlyCell(ws2, value="hello clientA")
ws2.append([cell, 3.14, None])
cell = WriteOnlyCell(ws3, value="hello clientD")
ws3.append([cell, 3.14, None])
cell = WriteOnlyCell(ws4, value="hello clientC")
ws4.append([cell, 3.14, None])
### The key to your question - The sorting of titles ###
output_workbook._sheets.sort(key=lambda ws: ws.title)
#Finally save
output_workbook.save("output_file.xlsx")
Output Excel:
OPTION 2 (better and safer option): using move_sheet()
As per Charlie Clark's recommendation, the better and safer option is to use move_sheet(). Note that I am only including the code that replaces the last two statements (_sheets.sort and save). The results are the same...
asc_sheetlist = output_workbook.sheetnames
asc_sheetlist.sort()
for pos, name in enumerate(asc_sheetlist):
    output_workbook.move_sheet(name, pos - output_workbook.sheetnames.index(name))
output_workbook.save("output_file.xlsx")
Based on Redox's answer, I was able to throw together a super-simple function to achieve this -
from openpyxl import load_workbook

def alphabetical_client_sheet_sort(output_file):
    workbook = load_workbook(output_file)
    workbook._sheets.sort(key=lambda ws: ws.title)
    workbook.save(output_file)
    workbook.close()

Reading more than one dataframe from a function

I wrote this code to read datasets:
import pandas as pd
import pathlib
def load_coordinates(structure, segments=None):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"coordinates_{structure}.csv"
    return pd.read_csv(filepath, sep=";")
    if segments:
        return df.loc[df.segment.isin(segments)].reset_index(drop=True)
    else:
        return df

def load_population(structure, segments=None):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"population_{structure}.csv"
    return pd.read_csv(filepath, sep=";")
    if segments:
        return df.loc[df.segment.isin(segments)].reset_index(drop=True)
    else:
        return df

def load_ambul_praxen(structure, segments=None):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"ambul_praxen_{structure}.csv"
    return pd.read_csv(filepath, sep=";").drop(columns=["planet"])
    if segments:
        return df.loc[df.segment.isin(segments)].reset_index(drop=True)
    else:
        return df

def load_docs_matrix(structure, docs_selection, sum_up_vo):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"docs_matrix_{structure}.csv"
    return pd.read_csv(filepath, sep=";")
    selected_cols = ["segment"] + docs_selection
    df = df.loc[:, selected_cols]
    if sum_up_vo:
        df["vo"] = df[docs_selection].sum(axis=1)
        df = df.drop(columns=docs_selection)
    return df

def load_weights(structure, request, segments=None, docs_selection=None, sum_up_vo=None):
    if request == "population":
        return load_population(structure)
    elif request == "ambul_praxen":
        return load_ambul_praxen(structure)
    else:
        return load_docs_matrix(structure)

def load_data(structure, request, segments=None, docs_selection=None, sum_up_vo=None):
    coordinates = load_coordinates(structure)
    weights = load_weights(structure, request, segments, docs_selection, sum_up_vo)
    return coordinates.merge(weights, on="segment", how="inner")
Before calling the function, I want to add this to the load_docs_matrix function:
if segments:
    return df.loc[df.segment.isin(segments)].reset_index(drop=True)
else:
    return df
I called the function like this:
request = "ambul_praxen"
structure = "1868"
load_data(structure,request,
load_coordinates,
load_ambul_praxen,
load population,
load_weights)
What I want to achieve is put all the datasets in one dataframe with the load data function.after calling the function , I got an error that I gave 7 arguments but 5 is expected.I tried to adjust the arguments but it seems not to work.
Any idea on how I could solve this?
I'm not sure I understood the question, but if you just want to call these dataframe-reading functions, just:
def load_data(input_df, structure, request):
    coordinates = load_coordinates(structure)
    population = load_population(structure)
    ambul_praxen = load_ambul_praxen(structure)
    docs_matrix = load_docs_matrix(structure)
    weights = load_weights(structure, request)
    # Maybe concatenate them horizontally?
    return pd.concat([coordinates, population, ambul_praxen, docs_matrix, weights], axis=1)
Or just return them one by one...
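For instance, a minimal sketch of the "return them one by one" idea, using a hypothetical load_data_separately wrapper around the question's loader functions:
def load_data_separately(structure):
    # Return the individual dataframes as a tuple instead of concatenating them.
    coordinates = load_coordinates(structure)
    population = load_population(structure)
    ambul_praxen = load_ambul_praxen(structure)
    return coordinates, population, ambul_praxen

coordinates, population, ambul_praxen = load_data_separately("1868")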

Django ORM create rows dynamically with _meta.get_fields()

I am writing a function to read an .xlsx file and write its data to the database with Django. But I have so many fields that I cannot define them statically. I want to write these fields to the database with a certain algorithm. When designing this, I get the class attributes with _meta.get_fields(). But as seen in this example, the attributes always remain None. In views.py there is an example between the BEGIN and END markers. How can I solve this problem?
views.py
# ... some codes ...

@login_required(login_url='/admin/login/')
def import_process(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            file_in_memory = request.FILES['file'].read()
            wb = load_workbook(filename=BytesIO(file_in_memory), data_only=True)
            ws = wb.worksheets[0]
            ws_company_name = str(ws["D9"].value)
            ws_company_address = str(ws["D10"].value)
            # ... some codes ...
            # Company Tower Get ID loop
            company_staff = CompanyStaff.objects.filter(company_id=Company.objects.filter(company_name=ws_company_name)[0].id)[0]
            for col in COLUMN_PARAMETER_LIST:
                column_cell = str(col) + str(COLUMN_PARAMETER_BEGIN)
                ws_tower_type = str(ws[column_cell].value)
                tower_type = TowerType.objects.filter(tower_type=ws_tower_type)[0]
                c_tower_object = CompanyTower.objects.filter(company_staff_id=company_staff, tower_type_id=tower_type)[0]
                tower_data = TowerData.objects.create(company_tower_id=c_tower_object)
                ROW_IDX = int(ROW_PARAMETER_BEGIN - 1)
                # ****************** BEGIN ****************** #
                site_fields = tower_data._meta.get_fields()
                site_fields_names = [f.name for f in site_fields][1:]
                for mfield in site_fields_names:
                    if any(word in str(mfield) for word in TOWER_DATA_EXCLUDE_FIELDS_NEW):
                        continue
                    else:
                        ROW_IDX += 1
                        tower_data_cell = str(col) + str(ROW_IDX)
                        tower_data_cell_value = ws[tower_data_cell].value
                        tower_data.mfield = tower_data_cell_value
                        if str(mfield) == 'ph':  # Example Field
                            print(tower_data.mfield)
                            print(tower_data.ph)
                # ******************* END ******************* #
                tower_data.save()
                print("****************")
    return render(request, template_name='import.html', context={"form": UploadFileForm()})
# ... some codes ...
You're currently setting and replacing an attribute called mfield over and over: tower_data.mfield = tower_data_cell_value.
What you want is setattr(tower_data, mfield, tower_data_cell_value)
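A minimal sketch of that inner loop using setattr (and getattr for the debug print), reusing the variables from the question's code:
for mfield in site_fields_names:
    if any(word in str(mfield) for word in TOWER_DATA_EXCLUDE_FIELDS_NEW):
        continue
    ROW_IDX += 1
    tower_data_cell = str(col) + str(ROW_IDX)
    tower_data_cell_value = ws[tower_data_cell].value
    # Set the attribute whose *name* is stored in mfield, instead of a literal attribute called "mfield".
    setattr(tower_data, mfield, tower_data_cell_value)
    if str(mfield) == 'ph':  # example field
        print(getattr(tower_data, mfield))
tower_data.save()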

Reduce memory usage by json.loads with multiprocessing or Dask in Python

I have a CSV file with 1 million rows and 3 GB of data. I used pandas read_csv to convert it to a DataFrame, and that works well.
Now I have to format the data columns and append another column according to the values of some columns. To do this, I am using Dask DataFrame with npartitions and then applying it row-wise. We have 7.5 GB of RAM on our instance, but it hangs and kills the process with a MemoryError.
This is my code to format the data columns:
import pandas as pd
import json
import dask.dataframe as dd
import multiprocessing

def formatting_data(data):
    print("cleaning and formatting data")
    data["IsBadJson"] = False
    data["BadJsonStr"] = None
    data = (dd.from_pandas(data, npartitions=4 * multiprocessing.cpu_count())
              .map_partitions(lambda df: df.apply(lambda row: parse_data_to_json(row), axis=1))
              .compute(scheduler='processes'))
    return data
Below is the code for the parse_data_to_json function we use for formatting:
def parse_data_to_json(x):
    try:
        if x.get("RequestSent") == "nan":
            x["RequestSent"] = None
            x["IsBadJson"] = True
            x["BadJsonStr"] = str(x.get("RequestSent"))
        else:
            x["RequestSent"] = json.loads(x.get("RequestSent"))
            x["IsBadJson"] = False
            x["BadJsonStr"] = None
    except Exception as error:
        print("Found an error value in Tax Json field RequestSent: {}, error details: {}".format(x.get("RequestSent"), error))
        print("{}-{}-{}".format(None, True, str(x.get("RequestSent"))))
        x["RequestSent"] = None
        x["IsBadJson"] = True
        x["BadJsonStr"] = str(x.get("RequestSent"))
    try:
        if x.get("ResponseReceived") == "nan":
            x["ResponseReceived"] = None
            x["IsBadJson"] = True
            x["BadJsonStr"] = str(x.get("ResponseReceived"))
        else:
            x["ResponseReceived"] = json.loads(x.get("ResponseReceived"))
            x["IsBadJson"] = False
            x["BadJsonStr"] = None
    except Exception as error:
        print("Found an error value in Tax Json field ResponseReceived: {}, error details: {}".format(x.get("ResponseReceived"), error))
        print("{}-{}-{}".format(None, True, str(x.get("ResponseReceived"))))
        x["ResponseReceived"] = None
        x["IsBadJson"] = True
        x["BadJsonStr"] = str(x.get("ResponseReceived"))
    return x
I recommend allowing Dask to load your data directly from CSV, rather than passing it a pandas dataframe. See https://docs.dask.org/en/latest/best-practices.html#load-data-with-dask
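A minimal sketch of that approach, assuming a hypothetical input.csv, the parse_data_to_json function from the question, and illustrative blocksize/dtype settings that may need tuning for your data:
import dask.dataframe as dd

# Let Dask read the CSV itself in partitions instead of loading it all into pandas first.
ddf = dd.read_csv("input.csv", blocksize="64MB", dtype=str)
ddf["IsBadJson"] = False
ddf["BadJsonStr"] = None
result = (ddf.map_partitions(lambda df: df.apply(parse_data_to_json, axis=1))
             .compute(scheduler="processes"))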

Python null check import xls

There is a Python script which reads fields from an .xls file.
The script works, but there are problems when there are empty fields in the file: the script does not read a field if it is empty.
My code is below; in this example, the NORD field is empty:
from msexcel8com import *

def convert(dsIn, dsOut):
    import sys
    sys.setdefaultencoding("utf-8")
    import msexcel8com
    xlsApp = msexcel8com.Application()
    xlsApp.Workbooks.Open(unicode(dsIn["PATH_TO_XLS"]))
    xlsWorkbook = xlsApp.Workbooks.Item(1)
    xlsWorksheet = xlsWorkbook.Worksheets.Item(1)
    xlsWorksheet.Cells.SpecialCells(11, None).Activate()
    rowsCount = xlsApp.ActiveCell.Row
    import msxml2
    dsOut.clear()
    outXML = msxml2.DOMDocument()
    RootNode = outXML.createElement("MSG")
    RootNode.setAttribute("FORMAT", "IMPORT_LN")
    ChildNodes = outXML.appendChild(RootNode)
    i, k, c = 1, 1, 2
    while i < rowsCount:
        i = i + 1
        if k > c:
            k = 0
            dsOut.append()
            dsOut["XML_OUT"] = unicode.encode(outXML.xml, "utf-8")
            outXML = msxml2.DOMDocument()
            RootNode = outXML.createElement("MSG")
            RootNode.setAttribute("FORMAT", "IMPORT_LN")
            ChildNodes = outXML.appendChild(RootNode)
        try:
            TMPNode = outXML.createElement("CLIENT")
            TMPNode.setAttribute("NCODE", xlsWorksheet.Cells.Item(i, 1).Value)
            TMPNode.setAttribute("NORD", xlsWorksheet.Cells.Item(i, 2).Value)
            ChildNodes.appendChild(TMPNode)
            k = k + 1
        except Exception as e:
            print(e)
    dsOut.append()
    dsOut["XML_OUT"] = unicode.encode(outXML.xml, "utf-8")
    try:
        xlsApp.Workbooks.Close()
    except Exception as e:
        print(e)
    try:
        xlsApp.Quit()
    except Exception as e:
        print(e)
How can I make sure that, even if a field is empty, it is returned as null along with the rest of the values?
I couldn't resist the temptation of writing this without all that Excel automation.
Assuming an Excel file called so59715137.xls that looks something like this:
import xlrd  # assuming it's an .xls, not .xlsx
import xml.etree.ElementTree as et

def read_rows(xls_filename, column_labels):
    book = xlrd.open_workbook(xls_filename)
    sh = book.sheet_by_index(0)
    for rx in range(sh.nrows):
        yield dict(zip(column_labels, sh.row_values(rx)))

def convert(xls_filename):
    xml_root = et.Element("MSG", {"FORMAT": "IMPORT_LN"})
    for row in read_rows(xls_filename, ("NCODE", "NORD")):
        print(row)  # for debugging
        if row.get("NCODE") and row.get("NORD"):  # both attributes must be truthy
            et.SubElement(xml_root, "CLIENT", attrib=row)  # just use the dict as attributes
    return et.tostring(xml_root, encoding="unicode")

xml_content = convert("so59715137.xls")
print("-------------------------------")
print(xml_content)
# TODO: write to file
This outputs (with the debugging output included, so you can see that it reads, but elides, the rows that are missing data):
{'NCODE': 'foo', 'NORD': 'faa'}
{'NCODE': 'blep', 'NORD': 'blop'}
{'NCODE': 'missing', 'NORD': ''}
{'NCODE': '', 'NORD': 'other-missing'}
-------------------------------
<MSG FORMAT="IMPORT_LN"><CLIENT NCODE="foo" NORD="faa" /><CLIENT NCODE="blep" NORD="blop" /></MSG>
From there on out, it's easy to read/write your dsIn/dsOut structures.
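If you need to plug this back into the original convert(dsIn, dsOut) entry point, a rough sketch could look like the following; the dsIn["PATH_TO_XLS"], dsOut.append() and dsOut["XML_OUT"] calls simply mirror the question's code and are assumptions about that API rather than a documented library:
def convert_ds(dsIn, dsOut):
    dsOut.clear()
    xml_content = convert(dsIn["PATH_TO_XLS"])  # convert() as defined above
    dsOut.append()
    dsOut["XML_OUT"] = xml_content.encode("utf-8")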
