Can we define django model from csv file or pandas dataframe? - python

I have more than 200 columns in csv file and want to define Class in django model.
But I really can't find handful instructions to create django model from csv files (first row indicates columns)
Is there any way to define django model from csv file or pandas dataframe?

I generate texts to feed on model based on below codes
import numpy as np
for col in speech_data.columns:
arr = speech_data[col]
size = 0
value = ""
for ar in arr:
if len(str(ar)) > size:
size = len(str(ar))
value = ar
print col, size, type(speech_data.iloc[0][col]), ar
if type(speech_data.iloc[0][col]) == np.int64:
print col + " = " + "models.IntegerField()"
elif type(speech_data.iloc[0][col]) == str:
print col + " = " + "models.CharField(max_length="+ str(size+3) +")"
elif type(speech_data.iloc[0][col]) == np.float64:
print col + " = " + "models.FloatField(null=True, blank=True, default=None)"
elif type(speech_data.iloc[0][col]) == np.bool_:
print col + " = " + "models.BooleanField()"

Related

Openpyxl module: return weird value(not error) + hope to calculate

I wrote some codes trying to let the user be able to check the percentage of the money they spent(compared to the money they earned). Almost every step perform normally, until the final part.
a_c[('L'+row_t)].value return:
=<Cell 'Sheet1'.B5>/<Cell 'Sheet1'.J5>
yet I hope it should be some value.
Code:
st_column = st_column_r.capitalize()
row_s = str(a_c.max_row)
row_t = str(a_c.max_row + 1)
row = int(row_t)
a_c[('J'+row_t)] = ('=SUM(I2,J'+row_s+')') #總收入
errorprevention = a_c[('J'+row_t)].value
a_c[(st_column+row_t)] = ('=SUM('+(st_column+'2')+','+(st_column+row_s)+')')
a_c['L'+row_t].number_format = FORMAT_PERCENTAGE_00
if errorprevention != 0:
a_c[('L'+row_t)] = ('='+str(a_c[(st_column+row_t)])+'/'+str(a_c[('J'+row_t)]))
print('過往支出中,'+inputtype[st_column]+'類別佔總收入的比率為:'+a_c[('L'+row_t)].value)
Try changing the formula creation to;
a_c[('L' + row_t)].value = '=' + a_c[(st_column + row_t)].coordinate + '/' + a_c[('J' + row_t)].coordinate
or use an f string
a_c[('L' + row_t)].value = f"={a_c[(st_column + row_t)].coordinate}/{a_c[('J' + row_t)].coordinate}"

Load a apreadsheet and copy a row and pasted it in a different location

How can I copy a row for example from D51 to F51 and paste these values in the row T20 to AF20.
I know how to load a spreadsheet
workbook = load_workbook(output)
sheet = workbook.active
But I dont know how to itenarate in a loop to get this
sheet["T2"] = "=D6"
sheet["U2"] = "=E6"
sheet["V2"] = "=F6"
sheet["W2"] = "=G6"
sheet["X2"] = "=H6"
sheet["Y2"] = "=I6"
sheet["Z2"] = "=J6"
sheet["AA2"] = "=K6"
sheet["AB2"] = "=L6"
sheet["AC2"] = "=M6"
sheet["AD2"] = "=N6"
sheet["AE2"] = "=O6"
sheet["AF2"] = "=P6"
You can achieve this by using code below...
Note that the file output.xlsx is opened, updated and saved. The function num_to_excel_col is borrowed from here.
This will update columns 20 (T) onwards for the next 15 columns (all row 2) with the text as "=D6", "=E6", etc. The num_to_col function will convert the col number to equivalent excel string (for eg. 27 will be converted to AA, etc.)
import pandas as pd
import numpy as np
import openpyxl
workbook = openpyxl.load_workbook('output.xlsx')
ws = workbook.active
def num_to_excel_col(n):
if n < 1:
raise ValueError("Number must be positive")
result = ""
while True:
if n > 26:
n, r = divmod(n - 1, 26)
result = chr(r + ord('A')) + result
else:
return chr(n + ord('A') - 1) + result
outcol = 4 #Paste in col 'D'
for col in range(20,35): #col 20 is T and doing this for next 15 columns
txt = "="+num_to_excel_col(outcol)+"6"
print(txt)
ws.cell(row=2, column=col).value = txt
outcol += 1
workbook.save("output.xlsx")

how to compare two strings in pandas large dataframe (python3.x)?

I have two DFs from 2 excel files.
1st file(awcProjectMaster)(1500 records)
projectCode projectName
100101 kupwara
100102 kalaroos
100103 tangdar
2nd file(village master)(more than 10 million records)
villageCode villageName
425638 wara
783651 tangdur
986321 kalaroo
I need to compare the projectName and villageName along with the percentage match.
The following code works fine but it is slow. How can I do the same thing in a more efficient way.
import pandas as pd
from datetime import datetime
df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")
def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
with open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a") as f:
percentMatch = 0
vLen = len(vName)
prjLen = len(prjName)
if vLen > prjLen:
if vName.find(prjName) != -1:
percentMatch = (prjLen / vLen) * 100
f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
else:
res = 0
# print(res)
elif prjLen >= vLen:
if prjName.find(vName) != -1:
percentMatch = (vLen / prjLen) * 100
f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
else:
res = 0
# print(res)
f.close()
for idx, row in df.iterrows():
for idxv, r in df1.iterrows():
compare(
str(row["ProjectCode"]),
row["ProjectName"].lower(),
str(r["StateCensusCode"]),
r["StateName"],
str(r["DistrictCode"]),
r["DistrictName"],
str(r["SubDistrictCode"]),
r["SubDistrictNameInEnglish"],
str(r["VillageCode"]),
r["VillageNameInEnglish"].lower(),
)
Your distance metric for the strings isn't too accurate, but if it works for you, fine. (You may want to look into other options like the builtin difflib, or the Python-Levenshtein module, though.)
If you really do need to compare 1,500 x 10,000,000 records pairwise, things are bound to take some time, but there are a couple things that we can do pretty easily to speed things up:
open the log file only once; there's overhead, sometimes significant, in that
refactor your comparison function into a separate unit, then apply the lru_cache() memoization decorator to make sure each pair is compared only once, and the subsequent result is cached in memory. (In addition, see how we sort the vName/prjName pair – since the actual order of the two strings doesn't matter, we end up with half the cache size.)
Then for general cleanliness,
use the csv module for streaming CSV into a file (the output format is slightly different than with your code, but you can change this with the dialect parameter to csv.writer()).
Hope this helps!
import pandas as pd
from datetime import datetime
from functools import lru_cache
import csv
df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")
log_file = open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a")
log_writer = csv.writer(log_file)
#lru_cache()
def compare_vname_prjname(vName, prjName):
vLen = len(vName)
prjLen = len(prjName)
if vLen > prjLen:
if vName.find(prjName) != -1:
return (prjLen / vLen) * 100
elif prjLen >= vLen:
if prjName.find(vName) != -1:
return (vLen / prjLen) * 100
return None
def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
# help the cache decorator out by halving the number of possible pairs:
vName, prjName = sorted([vName, prjName])
percent_match = compare_vname_prjname(vName, prjName)
if percent_match is None: # No match
return False
log_writer.writerow(
[
prjCode,
prjName,
vCode,
vName,
round(percent_match),
stCode,
stName,
dCode,
dName + sdCode,
sdName,
]
)
return True
for idx, row in df.iterrows():
for idxv, r in df1.iterrows():
compare(
str(row["ProjectCode"]),
row["ProjectName"].lower(),
str(r["StateCensusCode"]),
r["StateName"],
str(r["DistrictCode"]),
r["DistrictName"],
str(r["SubDistrictCode"]),
r["SubDistrictNameInEnglish"],
str(r["VillageCode"]),
r["VillageNameInEnglish"].lower(),
)

Python stange behaviour when acessing list elements

This small scripts makes exactly what I need.
#!/usr/bin/python
import os
import fileinput
import sys
import shutil
import glob
import time
def replaceAll1(files,searchExp,replaceExp):
for line in fileinput.input(files, inplace=1):
if searchExp in line:
line = line.replace(searchExp,replaceExp)
sys.stdout.write(line)
param1 = [1,2,3]
param2 = [1,2,3]
param3 = [1,2,3]
for i in xrange(len(param1)):
for ii in xrange(len(param2)):
for iii in xrange(len(param3)):
os.system("cp -a cold.in input.in")
old_param1 = "param1 = 1"
old_param2 = "param2 = 1"
old_param3 = "param3 = 1"
new_param1 = "param1 = " + str(param1[i])
new_param2 = "param2 = " + str(param2[ii])
new_param3 = "param3 = " + str(param3[iii])
replaceAll1('input.in',old_param1,new_param1)
replaceAll1('input.in',old_param2,new_param2)
replaceAll1('input.in',old_param3,new_param3)
time.sleep(4)
It enters in a configuration file and replaces sequentially the input parameters according to the lists that are accessed by the loop indexes. It is simple a combination of all the three parameters between each other.
# Input file
param1 = 1 # --- Should be [1,2,3]
param2 = 1 # --- Should be [1,2,3]
param3 = 1 # --- Should be [1,2,3]
The problem is that his big brother is not behaving like it. When it loops through the lists, it gets lost in scheme = 2 and puts dissp_scheme = 2 (freezed) when it should be dissp_scheme = 1. I printed out every single variable that goes inside the function replaceAll marked with comments but when I turn on the other calls it mess up everything. Here is the script.
#!/usr/bin/python
import os
import fileinput
import sys
import shutil
import glob
import time
os.chdir(os.getcwd())
# Replaces the input file parameters
def replaceAll(files,searchExp,replaceExp):
for line in fileinput.input(files, inplace=1):
if searchExp in line:
line = line.replace(searchExp,replaceExp)
sys.stdout.write(line)
# Gets a number inside my input file.
def get_parameter(variable,file_name):
f = open(file_name,'r').readlines()
for i in xrange(len(f)):
index = f[i].find(variable)
if index != -1:
pre_found = f[i].split('=')[1]
return pre_found
# Gets the discretization scheme name.
def get_sheme(number):
if number == 1:
return "Simple Centered Scheme"
elif number == 2:
return "Lax-Wendroff Scheme"
elif number == 3:
return "MacCormack Scheme"
elif number == 4:
return "Beam-Warming Scheme"
elif number == 5:
return "Steger-Warming 1st Order Scheme"
elif number == 6:
return "Steger-Warming 2nd Order Scheme"
elif number == 7:
return "Van Leer 1st Order Scheme"
elif number == 8:
return "Van Leer 2nd Order Scheme"
elif number == 9:
return "Roe Scheme"
elif number == 10:
return "AUSM Scheme"
# Gets the dissipation scheme name.
def get_dissip(number):
if number == 1:
return "Pullian Non-Linear dissipation"
elif number == 2:
return "Second difference dissipation"
elif number == 3:
return "Fourth difference dissipation"
elif number == 4:
return "B&W dissipation"
# Generates the density gnuplot scripts.
def gnuplot(variable,pressure_ratio,scheme,dissip_scheme):
#gnuplot('Density',10,get_sheme(3),'Pullian')
# Building name of the output file.
outFileName = variable.lower() + '_ratio' + str(int(pressure_ratio)) + '_' + scheme.replace(" ","") + '_dissp' + dissip_scheme.replace(" ","") + '.tex'
gnuFileName = variable.lower() + '_ratio' + str(int(pressure_ratio)) + '_' + scheme.replace(" ","") + '_dissp' + dissip_scheme.replace(" ","") + '.gnu'
# Build title of the plot
title = 'Analytical vs Numerical | ' + scheme
f = open(gnuFileName,'w')
f.write("set term cairolatex monochrome size 15.0cm, 8cm\n")
f.write('set output "' + outFileName + '"\n')
f.write("set grid\n")
f.write('set xtics font "Times-Roman, 10\n')
f.write('set ytics font "Times-Roman, 10\n')
f.write('set xlabel "x position" center\n')
f.write('set ylabel "' + variable + '" center\n')
f.write('set title "Analytical vs Numerical Results | ' + variable + '" \n')
f.write('set pointsize 0.5\n')
f.write('set key font ",10"\n')
fortran_out_analytical = 'a' + variable.lower() + '.out'
fortran_out_numerical = variable.lower() + 'Output.out'
f.write('plot "' + fortran_out_analytical +'" u 1:2 with linespoints lt -1 lw 1 pt 4 title "Analytical",\\\n')
f.write( '"' + fortran_out_numerical + '" u 1:2 with lines lw 5 title "Numerical"\n')
f.close()
# Generate latex code.
def generate_latex(text_image_file,caption):
latex.write("\\begin{figure}[H]\n")
latex.write(" \centering\n")
latex.write(" \input{" + text_image_file + "}\n")
latex.write(" \caption{"+ caption +"}\n")
latex.write(" \label{fig:digraph}\n")
latex.write("\\end{figure}\n")
latex.write("\n\n")
# -----------------------------------------------------------------------
# Main loop.
# -----------------------------------------------------------------------
pressure_ratios = [5.0]
schemes = [1,2,3]
dissips = [1,2,3]
# Define replace lines for replace all.
scheme_line = "scheme = "
dissip_line = "dissp_scheme = "
# Open Latex export file.
latex = open("bizu.txt",'w')
i = 0
# ratios.
for i in xrange(len(pressure_ratios)):
print "----------------------------------------"
print " + Configuring File for pressure ratio: " + str(pressure_ratios[i])
print "----------------------------------------\n"
# Schemes
for jj in xrange(len(schemes)):
print " + Configuring file for scheme: " + get_sheme(schemes[jj]) + "\n"
for kkk in xrange(len(dissips)):
print " + Configuring file for dissip: " + get_dissip(dissips[kkk])
# We always work with a brand new file.
os.system("rm input.in")
os.system("cp -a cold.in input.in")
# Replace pressures.
p1_line_old = 'p1 = 5.0d0'
rho1_line_old = 'rho1 = 5.0d0'
p1_line_new = 'p1 = ' + str(pressure_ratios[i]) + 'd0'
rho1_line_new = 'rho1 = ' + str(pressure_ratios[i]) + 'd0'
replaceAll('input.in',p1_line_old,p1_line_new)
replaceAll('input.in',rho1_line_old,rho1_line_new)
# Replace discretization scheme.
old_scheme = scheme_line + "1"
new_scheme = scheme_line + str(schemes[jj])
#==========================================================
# This call is messing everything up when scheme turns to 2
#==========================================================
replaceAll('input.in',old_scheme,new_scheme)
# Replace dissipation scheme.
old_dissp_scheme = dissip_line + "1"
new_dissp_scheme = dissip_line + str(dissips[kkk])
print p1_line_old
print new_scheme
print new_dissp_scheme
replaceAll('input.in',old_dissp_scheme, new_dissp_scheme)
time.sleep(3)
# ### Calling program
# os.system("./sod")
#
latex.close()
And the input file that the it works on is:
&PAR_physical
p1 = 5.0d0
p4 = 1.0d0
rho1 = 5.0d0
rho4 = 1.0d0
fgamma = 1.4d0
R_const = 287.0d0
F_Cp = 1004.5
F_Cv = 717.5
/
&PAR_geometry
total_mesh_points = 1001
start_mesh_point = -5.0d0
final_mesh_point = 5.0d0
print_step = 100
/
&PAR_numeric
scheme = 3
iterations = 10000
time_step = 0.0001d0
/
&PAR_dissip
dissp_scheme = 3
dissip_omega = 0.5d0
/
Thank you all !

how to train the text importing from xlsx file using python classifier as SVM..?/

I have already imported the data using the following code but the problem is how to use it to implement svm.
from xlrd import open_workbook
import nltk
rb = open_workbook('/Users/tejeshwar/Downloads/Defect-Data.xlsx')
sheet = rb.sheet_by_index(0)
data = ()
for row_index in xrange(1, sheet.nrows): #train using 500
temp,add = (),()
desc,cat = 0,0 #trial
for col_index in xrange(sheet.ncols):
if col_index==2:
#print col_index
desc = sheet.cell(row_index,col_index).value
#print desc
#print cellname(row_index,col_index)
desc = "'" + desc
#temp +=(desc,)
#print temp
elif col_index==5:
#print col_index
cat = sheet.cell(row_index,col_index).value
#print cat
#print cellname(row_index,col_index)
cat = "'" + cat + "'"
add = add + (desc,cat)
#print (add)
data = data + (add,)
print 'done'
training_data = list(data)`
where desc is the description and cat is category
And i have to train the model in a manner if give the description it should predict or classify it to category

Categories

Resources