How to randomly swap CFG grammar rules? - python

I want to randomly swap rules in the grammar I defined for a CFG. For example, I have this grammar:
grammar = CFG.fromstring(f"""
S -> NP VP | VP NP
NP -> DET N| ADJ N
VP -> V NP | V ADJ | V ADV
V -> {VERB}
N -> {NOUN}
ADJ ->{ADJ}
DET -> {DET}
ADV -> {ADV}
""")
I want to randomly swap S -> NP VP | VP NP to S -> VP NP | NP VP in each iteration, and in each iteration I'll generate only one sentence.
import spacy
import codecs
import nltk
from nltk.parse.generate import generate,demo_grammar
import random
from nltk import Nonterminal
import substitute
from nltk.grammar import CFG
nlp = spacy.load("en_core_web_sm")
fin = [
    "Wow!The movie was a complete joy to watch, with an incredible cast delivering fantastic performances. The special effects were stunning. I highly recommend this movie to everyone."
]
for line in fin:
    sent = line  # Stripping the dataset based on tab. That is stripping label from sentence
    words = [x.lower() for x in nltk.word_tokenize(sent)]  # lowering the sentence and tokenizing
    sent = ' '.join(words)
    text = sent
    doc = nlp(text)
    noun = []
    verb = []
    adj = []
    det = []
    adv = []
    pq = []
    pp = []
    intj = []
    part = []
    not_or_no = ""
    # Token and Tag
    for token in doc:
        if token.text == "not" or token.text == "no":
            print("found")
            not_or_no = token.text
            continue
        # print(token.pos_)
        if token.pos_ == "NOUN" or token.pos_ == "PROPN" or token.pos_ == "PRON":
            noun.append(not_or_no + " " + token.text)
        elif token.pos_ == "VERB" or token.pos_ == "AUX":
            verb.append(not_or_no + " " + token.text)
        elif token.pos_ == "DET":
            det.append(token)
        elif token.pos_ == "ADJ":
            adj.append(not_or_no + " " + token.text)
        elif token.pos_ == "ADV":
            adv.append(not_or_no + " " + token.text)
        not_or_no = ""
    NOUN = ''
    VERB = ' '
    DET = ''
    ADV = ''
    ADJ = ' '
    CONJ = ''
    ADP = ''
    for i in range(0, len(noun)):
        NOUN += '"' + str(noun[i]) + '"'
        if i != len(noun) - 1 and len(noun) != 1:
            NOUN += ' | '
    for i in range(0, len(verb)):
        VERB += '"' + str(verb[i]) + '"'
        if i != len(verb) - 1 and len(verb) != 1:
            VERB += ' | '
    for i in range(0, len(det)):
        DET += '"' + str(det[i]) + '"'
        if i != len(det) - 1 and len(det) != 1:
            DET += ' | '
    for i in range(0, len(adj)):
        ADJ += '"' + str(adj[i]) + '"'
        if i != len(adj) - 1 and len(adj) != 1:
            ADJ += ' | '
    for i in range(0, len(adv)):
        ADV += '"' + str(adv[i]) + '"'
        if i != len(adv) - 1 and len(adv) != 1:
            ADV += ' | '
    grammar = CFG.fromstring(f"""
    S -> NP VP | VP NP
    NP -> DET N | ADJ N
    VP -> V NP | V ADJ | V ADV
    V -> {VERB}
    N -> {NOUN}
    ADJ -> {ADJ}
    DET -> {DET}
    ADV -> {ADV}
    """)
    for i in range(50):
        # Swapping the `S` rule randomly
        if random.choice([True, False]):
            new_grammar = grammar.substitute(NP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
        else:
            new_grammar = grammar.substitute(VP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
        for sentence in generate(new_grammar, n=1):
            new_sent = ' '.join(sentence)
            print(new_sent)
Traceback:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-26-81e9ddd4e9ec>", line 7, in <module>
import substitute
File "/usr/local/lib/python3.8/dist-packages/substitute/__init__.py", line 1, in <module>
from .substitute import *
File "/usr/local/lib/python3.8/dist-packages/substitute/substitute.py", line 126
equals = lambda (a,b):a==b
^
SyntaxError: invalid syntax
Here I want to randomly swap the nonterminals and generate a sentence from the randomly swapped grammar structure. But how can I randomly swap those nonterminals like S, NP, and VP?

Try this:
import random
from nltk.grammar import CFG
grammar = CFG.fromstring(f"""
S -> NP VP | VP NP
NP -> DET N| ADJ N
VP -> V NP | V ADJ | V ADV
V -> {VERB}
N -> {NOUN}
ADJ ->{ADJ}
DET -> {DET}
ADV -> {ADV}
""")
for i in range(50):
    # Swapping the `S` rule randomly
    if random.choice([True, False]):
        new_grammar = grammar.substitute(NP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
    else:
        new_grammar = grammar.substitute(VP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
    for sentence in generate(new_grammar, n=1):
        new_sent = ' '.join(sentence)
        print(new_sent)
Here the substitute method is used to create a new grammar with a swapped nonterminal rule. random.choice([True, False]) returns either True or False, so we can randomly select which swap to make.
Edit
Try this:
for i in range(50):
    if random.choice([True, False]):
        new_grammar = CFG.fromstring(f"""
        S -> VP NP | NP VP
        NP -> DET N | ADJ N
        VP -> V NP | V ADJ | V ADV
        V -> {VERB}
        N -> {NOUN}
        ADJ -> {ADJ}
        DET -> {DET}
        ADV -> {ADV}
        """)
    else:
        new_grammar = grammar
    for sentence in generate(new_grammar, n=1):
        new_sent = ' '.join(sentence)
        print(new_sent)
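A more general option (a sketch of my own, assuming NLTK's standard CFG, Nonterminal and generate APIs, not code from the original answer) is to rebuild the grammar with the alternatives for S randomly reordered. generate expands alternatives in the order the productions are listed, so reordering them changes which sentence comes out first when n=1:
import random
from nltk.grammar import CFG, Nonterminal
from nltk.parse.generate import generate

def shuffle_lhs(grammar, lhs_name="S"):
    # Return a new CFG whose productions for `lhs_name` are randomly reordered.
    lhs = Nonterminal(lhs_name)
    swapped = list(grammar.productions(lhs=lhs))
    random.shuffle(swapped)                      # e.g. S -> VP NP may now come first
    others = [p for p in grammar.productions() if p.lhs() != lhs]
    return CFG(grammar.start(), swapped + others)

for _ in range(50):
    new_grammar = shuffle_lhs(grammar, "S")      # `grammar` is the one built above
    for sentence in generate(new_grammar, n=1):
        print(' '.join(sentence))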

Related

How to convert a Python CPU script to run on the GPU

How can I edit a Python script that runs on the CPU so that it runs on a GPU that supports CUDA?
Here is the code:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import scipy.signal
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
from multiprocessing import Pool
import tqdm
def get_start(l):
for i,line in enumerate(l):
if 'Zeit' in line or 'Time' in line:
return i+1
def get_meas2(l):
index = {
0 :'fl',
1 :'fr',
2 :'rl',
3 :'rr',
4 :'rfl',
5 :'rfr',
6 :'rrl',
7 :'rrr',
}
res = {
'fl' : None ,
'fr' : None ,
'rl' : None ,
'rr' : None ,
'rfl' : None ,
'rfr' : None ,
'rrl' : None ,
'rrr' : None ,
}
res_c =res.copy()
meas = []
i=0
s = get_start(l)
for line in l[s:]:
sp = line.split(';')
sensor = index[int(sp[1])]
tt = sp[0].replace(' PM','').replace(' AM','')
t = dt.strptime (tt,'%m/%d/%Y %H:%M:%S')
val = float(sp[2].replace(',','.'))
if res_c[sensor]==None:
res_c[sensor] = val
else:
res_c['sample'] = i
res_c['time'] = t
i+=1
for k in res_c.keys():
if res_c[k]==None:
res_c[k]=0
meas.append(res_c)
res_c = res.copy()
res_c[sensor] = val
return meas
def read_csv1(filepath):
l = open(filepath,'r',errors='backslashreplace').readlines()
js ={
'createdBy': None ,
'csvFileName': None ,
'eftFileName': None ,
'hasCoDriver': None ,
'hasDriver': None ,
'measurement': None ,
'plateHeight': None ,
'plateLength': 600 ,
'rating': None ,
'recordedOn': None ,
'remarks': None ,
'sampleRate': 50 ,
'vehicleChassis': None ,
'vehicleManufacturer': None ,
'vehicleModel': None ,
'vehicleType': None ,
'vehicleWheelBase': 2650 ,
'vehicleYearOfConstruction': None ,
'version': None ,
}
js['measurement'] = get_meas2(l)
# return js
return parse_data(js)
def parse_data(data):
measurments = {}
for key in data["measurement"][0].keys():
measurments[key] = [d[key] for d in data["measurement"]]
return measurments
def AVG2(a,n):
windows = []
for k,v in n.items():
windows += [k] * v
i=0
res = []
cont = True
while cont:
for w in windows:
if i+w <len(a):
res.append(np.mean(a[i:i+w]))
i +=w
else:
cont = False
return res
def findFirstRise(a):
n = 5
m = 20
for i in range(len(a)-11):
if a[i+m] >1 and a[i]!=0:
if a[i+m]/a[i] > n:
return i
def sync(a, window={4:10,5:1}, k=0.0065, Wn=90, fs=5000):
# Step 1 --> under sampling
a = [float(i) for i in a if i!='#VALUE!' and i== i]
a = AVG2(a,window)
#Setp 2 --> scaling
a = np.vectorize(lambda x: np.abs(x-np.mean(a)))(a)
# k = np.mean(GT)/np.mean(a)
a = np.vectorize(lambda x: x*k)(a)
#step 3 --> iir filter
bb, aa = scipy.signal.iirfilter(1, Wn=Wn, fs=fs, btype="low", ftype="butter")
y_lfilter = scipy.signal.lfilter(bb, aa, a)
# y_lfilter = scipy.signal.lfilter((0.0661221,0.0661221), (1.,-0.8677558), a)
return y_lfilter
def syncP(a,param):
ik,iwn,ifs,i4,i5 = param
window={4:i4,5:i5}
k=ik
Wn=iwn
fs=ifs
# Step 1 --> under sampling
a = [float(i) for i in a if i!='#VALUE!' and i== i]
a = AVG2(a,window)
#Setp 2 --> scaling
a = np.vectorize(lambda x: np.abs(x-np.mean(a)))(a)
# k = np.mean(GT)/np.mean(a)
a = np.vectorize(lambda x: x*k)(a)
#step 3 --> iir filter
bb, aa = scipy.signal.iirfilter(1, Wn=Wn, fs=fs, btype="low", ftype="butter")
# print(bb, aa, sep="\n")
y_lfilter = scipy.signal.lfilter(bb, aa, a)
# y_lfilter = scipy.signal.lfilter((0.0661221,0.0661221), (1.,-0.8677558), a)
return y_lfilter
def syncOfsset(a,sb,offset=None):
if not offset:
offset = findFirstRise(a) - findFirstRise(sb)
if offset >0:
a = a[offset:]
else:
sb = sb[np.abs(offset):]
l = min(len(a),len(sb))
return a[:l],sb[:l],(offset,l)
def findOffset(a,sb):
return findFirstRise(a) - findFirstRise(sb)
def mae(a,sb):
if len(a) != len(sb): raise ValueError(f'len(a)= {len(a)} while len(sb) = {len(sb)}')
return np.abs(np.subtract(a,sb)).mean()
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
def getMoreThan3(a1,sb1):
aa = []
last=False
for i,v in enumerate(a1):
if v>3:
aa.append(True)
last=True
else:
if last:
if True in [j>3 for j in a1[i:i+1]]:
aa.append(True)
last = True
else:
aa.append(False)
last = False
else:
aa.append(False)
last = False
ca=[]
cb=[]
index = []
for i,(ai,sbi,t) in enumerate(zip(a1,sb1,aa)):
if t:
ca.append(ai)
cb.append(sbi)
index.append(i)
return ca,cb
def check2(a,b,param):
ik,iwn,ifs,i4,i5 = param
sb = sync(b, window={4:i4,5:i5}, k=ik ,Wn=iwn, fs=ifs)
a1,sb1,data = syncOfsset(a,sb)
offset,l = data
r2 = r2_score(a1,sb1)
for i in range(-5,5):
a2,sb2,data2 = syncOfsset(a,sb,offset=offset+i)
r22 = r2_score(a2,sb2)
if r22>r2:
a1,sb1,data = a2,sb2,data2
ca,cb = getMoreThan3(a1,sb1)
return mae(ca,cb),rmse(ca,cb),r2_score(ca,cb),pearsonr(ca,cb)[0],(a,sb),data
def check(a,b,param):
ik,iwn,ifs,i4,i5 = param
sb = sync(b, window={4:i4,5:i5}, k=ik ,Wn=iwn, fs=ifs)
a,sb,data = syncOfsset(a,sb)
ca,cb = getMoreThan3(a,sb)
return mae(ca,cb),rmse(ca,cb),r2_score(ca,cb),pearsonr(ca,cb)[0],(a,sb),data
def rmse(a,sb):
if len(a) != len(sb): raise ValueError(f'len(a)= {len(a)} while len(sb) = {len(sb)}')
return np.sqrt(np.square(np.subtract(a,sb)).mean())
def analyse(i,param):
f = f"test{i}.csv"
d1 = read_csv1(os.path.join('sensor1',f))
selSen = selectSensor(d1)
d1 = d1[selSen]
d3 = pd.read_csv(os.path.join('sensor3',f))['Unnamed: 2']
return check2(d1,d3,param),selSen
def selectSensor(a):
s = {}
for sensor in a:
if len(sensor)==2:
s[sensor] = max(a[sensor])
return max(s,key=s.get)
def handle_process(param):
i = 4
(res1,res2,r2,pr, _,data),selSen = analyse(i,param)
with open('results.csv','a') as f:
line = str(param) + ',' + str(res1) + ',' + str(res2) + ',' +str(r2) + ',' +str(pr) + ',' +str(data) + ',' +str(selSen) + ',' + '\n'
line = line.replace('(','').replace(')','')
f.write(line)
if __name__=='__main__':
base_dir1 = 'sensor1'
base_dir3 = 'sensor3'
rWn = range(80,100,5)
rK = range(65,80,5)
r4W = range(9,12,1)
r5W = range(0,3)
rFs = range(4000,6000,200)
params = []
for iWn in rWn:
for iFs in rFs:
for i4W in r4W:
for i5W in r5W:
for ik in rK:
params.append((ik/10000,iWn,iFs,i4W,i5W))
with open('results.csv','a') as f:
line = ['k','wn','fs','window4','window5',\
'mae(>3)','rmse(>3)','r2(>3)','pearsonr(>3)',\
'OffsetStart','length','SelectedSensor','\n']
f.write(','.join(line))
f.close()
pool = Pool(processes=8)
for _ in tqdm.tqdm(pool.imap_unordered(handle_process, params), total=len(params)):
pass
To make sure that I have the ability to run code on the GPU, I successfully tested the sample code on my laptop, and based on it I modified my code as follows. But I am facing the following error after execution.
from numba import jit, cuda
import numpy as np
import pandas as pd
from datetime import datetime as dt
import scipy.signal
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
from multiprocessing import Pool
import tqdm
def get_start(l):
for i,line in enumerate(l):
if 'Zeit' in line or 'Time' in line:
return i+1
def get_meas2(l):
index = {
0 :'fl',
1 :'fr',
2 :'rl',
3 :'rr',
4 :'rfl',
5 :'rfr',
6 :'rrl',
7 :'rrr',
}
res = {
'fl' : None ,
'fr' : None ,
'rl' : None ,
'rr' : None ,
'rfl' : None ,
'rfr' : None ,
'rrl' : None ,
'rrr' : None ,
}
res_c =res.copy()
meas = []
i=0
s = get_start(l)
for line in l[s:]:
sp = line.split(';')
sensor = index[int(sp[1])]
tt = sp[0].replace(' PM','').replace(' AM','')
t = dt.strptime (tt,'%m/%d/%Y %H:%M:%S')
val = float(sp[2].replace(',','.'))
if res_c[sensor]==None:
res_c[sensor] = val
else:
res_c['sample'] = i
res_c['time'] = t
i+=1
for k in res_c.keys():
if res_c[k]==None:
res_c[k]=0
meas.append(res_c)
res_c = res.copy()
res_c[sensor] = val
return meas
def read_csv1(filepath):
l = open(filepath,'r',errors='backslashreplace').readlines()
js ={
'createdBy': None ,
'csvFileName': None ,
'eftFileName': None ,
'hasCoDriver': None ,
'hasDriver': None ,
'measurement': None ,
'plateHeight': None ,
'plateLength': 600 ,
'rating': None ,
'recordedOn': None ,
'remarks': None ,
'sampleRate': 50 ,
'vehicleChassis': None ,
'vehicleManufacturer': None ,
'vehicleModel': None ,
'vehicleType': None ,
'vehicleWheelBase': 2650 ,
'vehicleYearOfConstruction': None ,
'version': None ,
}
js['measurement'] = get_meas2(l)
# return js
return parse_data(js)
def parse_data(data):
measurments = {}
for key in data["measurement"][0].keys():
measurments[key] = [d[key] for d in data["measurement"]]
return measurments
def AVG2(a,n):
windows = []
for k,v in n.items():
windows += [k] * v
i=0
res = []
cont = True
while cont:
for w in windows:
if i+w <len(a):
res.append(np.mean(a[i:i+w]))
i +=w
else:
cont = False
return res
def findFirstRise(a):
n = 5
m = 20
for i in range(len(a)-11):
if a[i+m] >1 and a[i]!=0:
if a[i+m]/a[i] > n:
return i
def sync(a, window={4:10,5:1}, k=0.0065, Wn=90, fs=5000):
# Step 1 --> under sampling
a = [float(i) for i in a if i!='#VALUE!' and i== i]
a = AVG2(a,window)
#Setp 2 --> scaling
a = np.vectorize(lambda x: np.abs(x-np.mean(a)))(a)
# k = np.mean(GT)/np.mean(a)
a = np.vectorize(lambda x: x*k)(a)
#step 3 --> iir filter
bb, aa = scipy.signal.iirfilter(1, Wn=Wn, fs=fs, btype="low", ftype="butter")
y_lfilter = scipy.signal.lfilter(bb, aa, a)
# y_lfilter = scipy.signal.lfilter((0.0661221,0.0661221), (1.,-0.8677558), a)
return y_lfilter
def syncP(a,param):
ik,iwn,ifs,i4,i5 = param
window={4:i4,5:i5}
k=ik
Wn=iwn
fs=ifs
# Step 1 --> under sampling
a = [float(i) for i in a if i!='#VALUE!' and i== i]
a = AVG2(a,window)
#Setp 2 --> scaling
a = np.vectorize(lambda x: np.abs(x-np.mean(a)))(a)
# k = np.mean(GT)/np.mean(a)
a = np.vectorize(lambda x: x*k)(a)
#step 3 --> iir filter
bb, aa = scipy.signal.iirfilter(1, Wn=Wn, fs=fs, btype="low", ftype="butter")
# print(bb, aa, sep="\n")
y_lfilter = scipy.signal.lfilter(bb, aa, a)
# y_lfilter = scipy.signal.lfilter((0.0661221,0.0661221), (1.,-0.8677558), a)
return y_lfilter
def syncOfsset(a,sb,offset=None):
if not offset:
offset = findFirstRise(a) - findFirstRise(sb)
if offset >0:
a = a[offset:]
else:
sb = sb[np.abs(offset):]
l = min(len(a),len(sb))
return a[:l],sb[:l],(offset,l)
def findOffset(a,sb):
return findFirstRise(a) - findFirstRise(sb)
def mae(a,sb):
if len(a) != len(sb): raise ValueError(f'len(a)= {len(a)} while len(sb) = {len(sb)}')
return np.abs(np.subtract(a,sb)).mean()
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
def getMoreThan3(a1,sb1):
aa = []
last=False
for i,v in enumerate(a1):
if v>3:
aa.append(True)
last=True
else:
if last:
if True in [j>3 for j in a1[i:i+1]]:
aa.append(True)
last = True
else:
aa.append(False)
last = False
else:
aa.append(False)
last = False
ca=[]
cb=[]
index = []
for i,(ai,sbi,t) in enumerate(zip(a1,sb1,aa)):
if t:
ca.append(ai)
cb.append(sbi)
index.append(i)
return ca,cb
@jit(target_backend='cuda')
def check2(a,b,param):
ik,iwn,ifs,i4,i5 = param
sb = sync(b, window={4:i4,5:i5}, k=ik ,Wn=iwn, fs=ifs)
a1,sb1,data = syncOfsset(a,sb)
offset,l = data
r2 = r2_score(a1,sb1)
for i in range(-5,5):
a2,sb2,data2 = syncOfsset(a,sb,offset=offset+i)
r22 = r2_score(a2,sb2)
if r22>r2:
a1,sb1,data = a2,sb2,data2
ca,cb = getMoreThan3(a1,sb1)
return mae(ca,cb),rmse(ca,cb),r2_score(ca,cb),pearsonr(ca,cb)[0],(a,sb),data
@jit(target_backend='cuda')
def check(a,b,param):
ik,iwn,ifs,i4,i5 = param
sb = sync(b, window={4:i4,5:i5}, k=ik ,Wn=iwn, fs=ifs)
a,sb,data = syncOfsset(a,sb)
ca,cb = getMoreThan3(a,sb)
return mae(ca,cb),rmse(ca,cb),r2_score(ca,cb),pearsonr(ca,cb)[0],(a,sb),data
@jit(target_backend='cuda')
def rmse(a,sb):
if len(a) != len(sb): raise ValueError(f'len(a)= {len(a)} while len(sb) = {len(sb)}')
return np.sqrt(np.square(np.subtract(a,sb)).mean())
@jit(target_backend='cuda')
def analyse(i,param):
f = f"test{i}.csv"
d1 = read_csv1(os.path.join('sensor1',f))
selSen = selectSensor(d1)
d1 = d1[selSen]
d3 = pd.read_csv(os.path.join('sensor3',f))['Unnamed: 2']
return check2(d1,d3,param),selSen
@jit(target_backend='cuda')
def selectSensor(a):
s = {}
for sensor in a:
if len(sensor)==2:
s[sensor] = max(a[sensor])
return max(s,key=s.get)
@jit(target_backend='cuda')
def handle_process(param):
i = 4
(res1,res2,r2,pr, _,data),selSen = analyse(i,param)
with open('results.csv','a') as f:
line = str(param) + ',' + str(res1) + ',' + str(res2) + ',' +str(r2) + ',' +str(pr) + ',' +str(data) + ',' +str(selSen) + ',' + '\n'
line = line.replace('(','').replace(')','')
f.write(line)
if __name__=='__main__':
base_dir1 = 'sensor1'
base_dir3 = 'sensor3'
rWn = range(80,100,5)
rK = range(65,80,5)
r4W = range(9,12,1)
r5W = range(0,3)
rFs = range(4000,6000,200)
params = []
for iWn in rWn:
for iFs in rFs:
for i4W in r4W:
for i5W in r5W:
for ik in rK:
params.append((ik/10000,iWn,iFs,i4W,i5W))
with open('results.csv','a') as f:
line = ['k','wn','fs','window4','window5',\
'mae(>3)','rmse(>3)','r2(>3)','pearsonr(>3)',\
'OffsetStart','length','SelectedSensor','\n']
f.write(','.join(line))
f.close()
pool = Pool(processes=1)
for _ in tqdm.tqdm(pool.imap_unordered(handle_process, params), total=len(params)):
pass
But I receive this error:
0%| | 0/1080 [00:05<?, ?it/s]
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\ProgramData\miniconda3\lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\ProgramData\miniconda3\lib\site-packages\numba\core\dispatcher.py", line 471, in _compile_for_args
error_rewrite(e, 'unsupported_error')
File "C:\ProgramData\miniconda3\lib\site-packages\numba\core\dispatcher.py", line 409, in error_rewrite
raise e.with_traceback(None)
numba.core.errors.UnsupportedError: Failed in object mode pipeline (step: analyzing bytecode)
The 'with (context manager) as (variable):' construct is not supported.
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\Amin\Downloads\AnalyseSelectSensor\analyseoneg.py", line 299, in <module>
for _ in tqdm.tqdm(pool.imap_unordered(handle_process, params), total=len(params)):
File "C:\ProgramData\miniconda3\lib\site-packages\tqdm\std.py", line 1195, in __iter__
for obj in iterable:
File "C:\ProgramData\miniconda3\lib\multiprocessing\pool.py", line 873, in next
raise value
numba.core.errors.UnsupportedError: Failed in object mode pipeline (step: analyzing bytecode)
The 'with (context manager) as (variable):' construct is not supported.
Do you have any idea how I can solve this issue?
Kind regards.
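One possible direction, sketched here under the assumption that the error comes from the with open(...) as f: block inside the numba-decorated handle_process (numba cannot compile that construct): keep the file I/O in plain Python and jit only a purely numerical helper. This is an illustrative sketch, not a tested fix; rmse_kernel and the placeholder arrays are hypothetical:
from numba import jit
import numpy as np

@jit(nopython=True)                      # jit only numeric work; no file handles inside
def rmse_kernel(a, sb):
    return np.sqrt(np.mean((a - sb) ** 2))

def handle_process(param):
    # plain-Python wrapper: compute with the jitted kernel, then do the file I/O here
    a = np.random.rand(100)              # placeholders standing in for the real data
    sb = np.random.rand(100)
    value = rmse_kernel(a, sb)
    with open('results.csv', 'a') as f:  # the 'with' block is fine outside numba
        f.write(str(param) + ',' + str(value) + '\n')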

Confidence score of answer extracted using ELMo BiDAF model and AllenNLP

I'm working on a deep learning project where I use a bidirectional attention flow model (an AllenNLP pretrained model) to build a question answering system. It uses the SQuAD dataset. The BiDAF model extracts the answer span from a paragraph. Is there any way to determine the confidence score (accuracy) or any other metric for the answer extracted by the model?
I have used the evaluate subcommand from the allennlp package, but it only gives the score of the model after testing. I was hoping there is an easier way to solve this using another such command.
Attaching the code and the terminal output below.
from rake_nltk import Rake
from string import punctuation
from nltk.corpus import stopwords
from allennlp.predictors.predictor import Predictor
import spacy
import wikipedia
import re
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import traceback
from nltk.stem import SnowballStemmer
from nltk.util import ngrams
from math import log10
from flask import Flask, request, jsonify, render_template
from gevent.pywsgi import WSGIServer
import time
import multiprocessing as mp
from gtts import gTTS
import os
NLP = spacy.load('en_core_web_md')
stop = stopwords.words('english')
symbol = r"""!#$%^&*();:\n\t\\\"!\{\}\[\]<>-\?"""
stemmer = SnowballStemmer('english')
wikipedia.set_rate_limiting(True)
session = HTMLSession()
results = 5
try:
predictor = Predictor.from_path("bidaf-model-2017.09.15-charpad.tar.gz")
except:
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2018.11.30-charpad.tar.gz")
try:
srl = Predictor.from_path('srl-model-2018.05.25.tar.gz')
except:
srl = Predictor.from_path('https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz')
key = Rake(min_length=1, stopwords=stop, punctuations=punctuation, max_length=6)
wh_words = "who|what|how|where|when|why|which|whom|whose|explain".split('|')
stop.extend(wh_words)
session = HTMLSession()
output = mp.Queue()
def termFrequency(term, doc):
normalizeTermFreq = re.sub('[\[\]\{\}\(\)]', '', doc.lower()).split()
normalizeTermFreq = [stemmer.stem(i) for i in normalizeTermFreq]
dl = len(normalizeTermFreq)
normalizeTermFreq = ' '.join(normalizeTermFreq)
term_in_document = normalizeTermFreq.count(term)
#len_of_document = len(normalizeTermFreq )
#normalized_tf = term_in_document / len_of_document
normalized_tf = term_in_document
return normalized_tf, normalizeTermFreq, dl#, n_unique_term
def inverseDocumentFrequency(term, allDocs):
num_docs_with_given_term = 0
for doc in allDocs:
if term in doc:
num_docs_with_given_term += 1
if num_docs_with_given_term > 0:
total_num_docs = len(allDocs)
idf_val = log10(((total_num_docs+1) / num_docs_with_given_term))
term_split = term.split()
if len(term_split) == 3:
if len([term_split[i] for i in [0, 2] if term_split[i] not in stop]) == 2:
return idf_val*1.5
return idf_val
return idf_val
else:
return 0
def sent_formation(question, answer):
tags_doc = NLP(question)
tags_doc_cased = NLP(question.title())
tags_dict_cased = {i.lower_:i.pos_ for i in tags_doc_cased}
tags_dict = {i.lower_:i.pos_ for i in tags_doc}
question_cased = []
for i in question[:-1].split():
if tags_dict[i] == 'PROPN' or tags_dict[i] == 'NOUN':
question_cased.append(i.title())
else:
question_cased.append(i.lower())
question_cased.append('?')
question_cased = ' '.join(question_cased)
#del tags_dict,tags_doc, tags_doc_cased
pre = srl.predict(question_cased)
verbs = []
arg1 = []
for i in pre['verbs']:
verbs.append(i['verb'])
if 'B-ARG1' in i['tags']:
arg1.append((i['tags'].index('B-ARG1'), i['tags'].count('I-ARG1'))\
if not pre['words'][i['tags'].index('B-ARG1')].lower() in wh_words else \
(i['tags'].index('B-ARG2'), i['tags'].count('I-ARG2')))
arg1 = arg1[0] if arg1 else []
if not arg1:
verb_idx = pre['verbs'][0]['tags'].index('B-V')
verb = pre['words'][verb_idx] if pre['words'][verb_idx] != answer.split()[0].lower() else ''
subj_uncased = pre['words'][verb_idx+1:] if pre['words'][-1] not in symbol else \
pre['words'][verb_idx+1:-1]
else:
verb = ' '.join(verbs)
subj_uncased = pre['words'][arg1[0]:arg1[0]+arg1[1]+1]
conj = ''
if question.split()[0].lower() == 'when':
conj = ' on' if len(answer.split()) > 1 else ' in'
subj = []
for n, i in enumerate(subj_uncased):
if tags_dict_cased[i.lower()] == 'PROPN' and tags_dict[i.lower()] != 'VERB' or n == 0:
subj.append(i.title())
else:
subj.append(i.lower())
subj[0] = subj[0].title()
print(subj)
print(pre)
subj = ' '.join(subj)
sent = "{} {}{} {}.".format(subj, verb, conj, answer if answer[-1] != '.' else answer[:-1])
return sent
class extractAnswer:
def __init__(self):
self.wiki_error = (wikipedia.exceptions.DisambiguationError,
wikipedia.exceptions.HTTPTimeoutError,
wikipedia.exceptions.WikipediaException)
self.article_title = None
# symbol = """!#$%^&*();:\n\t\\\"!\{\}\[\]<>-\?"""
def extractAnswer_model(self, passage, question, s=0.4, e=0.3, wiki=False):
if type(passage) == list:
passage = " ".join(passage)
if not question[-1] == '?':
question = question+'?'
pre = predictor.predict(passage=passage, question=question)
if wiki:
if max(pre['span_end_probs']) > 0.5:
s = 0.12
elif max(pre['span_end_probs']) > 0.4:
s = 0.13
elif max(pre['span_end_probs']) > 0.35:
s = 0.14
if max(pre['span_start_probs']) > 0.5:
e = 0.12
elif max(pre['span_start_probs']) > 0.4:
e = 0.14
elif max(pre['span_start_probs']) > 0.3:
e = 0.15
if max(pre['span_start_probs']) > s and max(pre['span_end_probs']) > e:
key.extract_keywords_from_text(question)
ques_key = [stemmer.stem(i) for i in ' '.join(key.get_ranked_phrases())]
key.extract_keywords_from_text(passage)
pass_key = [stemmer.stem(i) for i in ' '.join(key.get_ranked_phrases())]
l = len(ques_key)
c = 0
for i in ques_key:
if i in pass_key:
c += 1
if c >= l/2:
print(max(pre['span_start_probs']),
max(pre['span_end_probs']))
if wiki:
return pre['best_span_str'], max(pre['span_start_probs']) + max(pre['span_end_probs'])
try:
ans = sent_formation(question, pre['best_span_str'])
except:
ans = pre['best_span_str']
print(traceback.format_exc())
return ans
print(ques_key, c, l)
print(max(pre['span_start_probs']), max(pre['span_end_probs']))
return 0, 0
else:
print(max(pre['span_start_probs']), max(pre['span_end_probs']), pre['best_span_str'])
return 0, 0
def wiki_search_api(self, query):
article_list = []
try:
article_list.extend(wikipedia.search(query, results=results))
print(article_list)
return article_list
except self.wiki_error:
params = {'search': query, 'profile': 'engine_autoselect',
'format': 'json', 'limit': results}
article_list.extend(requests.get('https://en.wikipedia.org/w/api.php?action=opensearch',
params=params).json()[1])
return article_list
except:
print('Wikipedia search error!')
print(traceback.format_exc())
return 0
def wiki_passage_api(self, article_title, article_list, output):
# Disambiguation_title = {}
try:
passage = wikipedia.summary(article_title)
output.put((article_title, self.passage_pre(passage)))
except wikipedia.exceptions.DisambiguationError as e:
print(e.options[0], e.options)
Disambiguation_pass = {}
for p in range(2 if len(e.options) > 1 else len(e.options)):
params = {'search':e.options[p], 'profile':'engine_autoselect', 'format':'json'}
article_url = requests.get('https://en.wikipedia.org/w/api.php?action=opensearch',
params=params).json()
if not article_url[3]:
continue
article_url = article_url[3][0]
r = session.get(article_url)
soup = BeautifulSoup(r.html.raw_html)
print(soup.title.string)
article_title_dis = soup.title.string.rsplit('-')[0].strip()
if article_title_dis in article_list:
print('continue')
continue
try:
url = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&titles={}".format(article_title_dis)
passage = requests.get(url).json()['query']['pages']
for i in passage.keys():
if 'extract' in passage[i]:
Disambiguation_pass[article_title_dis] = self.passage_pre(passage[i]['extract'])
except wikipedia.exceptions.HTTPTimeoutError:
passage = wikipedia.summary(article_title_dis)
Disambiguation_pass[article_title_dis] = self.passage_pre(passage)
except:
Disambiguation_pass[article_title_dis] = ''
continue
output.put((article_title, Disambiguation_pass))
except:
output.put((article_title, ''))
print(traceback.format_exc())
def sorting(self, article, question, topic):
processes = [mp.Process(target=self.wiki_passage_api, args=(article[x], article, output))\
for x in range(len(article))]
for p in processes:
p.start()
for p in processes:
p.join(timeout=3)
results_p = [output.get() for p in processes]
article_list = []
passage_list = []
for i, j in results_p:
if type(j) != dict and j:
article_list.append(i)
passage_list.append(j)
elif type(j) == dict and j:
for k, l in j.items():
if l:
article_list.append(k)
passage_list.append(l)
normalize_passage_list = []
start = time.time()
keywords = " ".join(self.noun+self.ques_key+[topic.lower()])
keywords = re.sub('[{0}]'.format(symbol), ' ', keywords).split()
question = question+' '+topic
ques_tokens = [stemmer.stem(i.lower()) for i in question.split() \
if i.lower() not in wh_words]
print(ques_tokens)
keywords_bigram = [' '.join(i) for i in list(ngrams(ques_tokens, 2)) \
if i[0] not in stop and i[1] not in stop]
if len(ques_tokens) > 3:
keywords_trigram = [' '.join(i) for i in list(ngrams(ques_tokens, 3)) \
if (i[0] in stop) + (i[2] in stop) + (i[1] in stop) < 3]
else:
keywords_trigram = []
if len(ques_tokens) > 5:
keywords_4gram = [' '.join(i) for i in list(ngrams(ques_tokens, 4)) \
if (i[0] in stop) + (i[2] in stop) +(i[1] in stop)+(i[3] in stop) < 4]
else:
keywords_4gram = []
keywords_unigram = list(set([stemmer.stem(i.lower()) for i in keywords \
if i.lower() not in stop]))
keywords = keywords_unigram+list(set(keywords_bigram))+keywords_trigram+keywords_4gram
tf = []
if not passage_list:
return 0
pass_len = []
#n_u_t=[]
#key_dict = {i: keywords.count(i) for i in keywords}
print('Extraction complete')
#remove_pass={}
#for n,i in enumerate(passage_list):
#if len(i)<200 or not i:
#remove_pass[article_list[n]]=i
#print(n, article_list[n])
#passage_list=[i for i in passage_list if i not in remove_pass.values()]
#article_list=[i for i in article_list if i not in remove_pass.keys()]
passage_list_copy = passage_list.copy()
article_list_copy = article_list.copy()
for i in range(len(passage_list_copy)):
if passage_list.count(passage_list_copy[i]) > 1:
passage_list.remove(passage_list_copy[i])
article_list.remove(article_list_copy[i])
print('Copy:', article_list_copy[i])
del passage_list_copy
del article_list_copy
for n, i in enumerate(passage_list):
temp_tf = {}
c = 0
for j in keywords:
temp_tf[j], temp_pass, temp_len = termFrequency(j, i + ' ' + article_list[n])
if temp_tf[j]:
c += 1
normalize_passage_list.append(temp_pass)
pass_len.append(temp_len)
temp_tf['key_match'] = c
tf.append(temp_tf)
print(pass_len)
print(keywords)
idf = {}
for i in keywords:
idf[i] = inverseDocumentFrequency(i, normalize_passage_list)
#print(tf, idf)
tfidf = []
#b=0.333 #for PLN
b, k = 0.75, 1.2 #for BM25
avg_pass_len = sum(pass_len)/len(pass_len)
#pivot=sum(n_u_t)/len(n_u_t)
for n, i in enumerate(tf):
tf_idf = 0
#avg_tf=sum(i.values())/len(i)
key_match_ratio = i['key_match']/len(keywords)
for j in keywords:
#tf_idf+=idf[j]*((log(1+log(1+i[j])))/(1-b+(b*pass_len[n]/avg_pass_len))) #PLN
tf_idf += idf[j]*(((k+1)*i[j])/(i[j]+k*(1-b+(b*pass_len[n]/avg_pass_len)))) #BM25
tfidf.append(tf_idf*key_match_ratio)
tfidf = [i/sum(tfidf)*100 for i in tfidf if any(tfidf)]
if not tfidf:
return 0, 0, 0, 0, 0
print(tfidf)
print(article_list, len(passage_list))
if len(passage_list) > 1:
sorted_tfidf = sorted(tfidf, reverse=1)
idx1 = tfidf.index(sorted_tfidf[0])
passage1 = passage_list[idx1]
#article_title=
tfidf1 = sorted_tfidf[0]
idx2 = tfidf.index(sorted_tfidf[1])
passage2 = passage_list[idx2]
article_title = (article_list[idx1], article_list[idx2])
tfidf2 = sorted_tfidf[1]
else:
article_title = 0
tfidf2 = 0
if passage_list:
passage1 = passage_list[0]
tfidf1 = tfidf[0]
passage2 = 0
else:
passage1 = 0
passage2 = 0
tfidf1, tfidf2 = 0, 0
end = time.time()
print('TFIDF time:', end-start)
return passage1, passage2, article_title, tfidf1, tfidf2
def passage_pre(self, passage):
#passage=re.findall("[\da-zA-z\.\,\'\-\/\–\(\)]*", passage)
passage = re.sub('\n', ' ', passage)
passage = re.sub('\[[^\]]+\]', '', passage)
passage = re.sub('pronunciation', '', passage)
passage = re.sub('\\\\.+\\\\', '', passage)
passage = re.sub('{.+}', '', passage)
passage = re.sub(' +', ' ', passage)
return passage
def wiki(self, question, topic=''):
if not question:
return 0
question = re.sub(' +', ' ', question)
question = question.title()
key.extract_keywords_from_text(question)
self.ques_key = key.get_ranked_phrases()
doc = NLP(question)
self.noun = [str(i).lower() for i in doc.noun_chunks if str(i).lower() not in wh_words]
print(self.ques_key, self.noun)
question = re.sub('[{0}]'.format(symbol), ' ', question)
if not self.noun + self.ques_key:
return 0
article_list = None
question = question.lower()
if self.noun:
if len(self.noun) == 2 and len(" ".join(self.noun).split()) < 6:
#question1=question
self.noun = " ".join(self.noun).split()
if self.noun[0] in stop:
self.noun.pop(0)
self.noun = question[question.index(self.noun[0]):question.index(self.noun[-1]) \
+len(self.noun[-1])+1].split()
#del question1
print(self.noun)
article_list = self.wiki_search_api(' '.join(self.noun))
if self.ques_key and not article_list:
article_list = self.wiki_search_api(self.ques_key[0])
if not article_list:
article_list = self.wiki_search_api(' '.join(self.ques_key))
if not article_list:
print('Article not found on wikipedia.')
return 0, 0
article_list = list(set(article_list))
passage1, passage2, article_title, tfidf1, tfidf2 = self.sorting(article_list,
question, topic)
if passage1:
ans1, conf1 = self.extractAnswer_model(passage1, question, s=0.20, e=0.20, wiki=True)
else:
ans1, conf1 = 0, 0
if ans1:
conf2 = 0
if len(ans1) > 600:
print(ans1)
print('Repeat')
ans1, conf1 = self.extractAnswer_model(ans1, question, s=0.20, e=0.20, wiki=True)
threshhold = 0.3 if not ((tfidf1- tfidf2) <= 10) else 0.2
if round(tfidf1- tfidf2) < 5:
threshhold = 0
if (tfidf1- tfidf2) > 20:
threshhold = 0.35
if (tfidf1- tfidf2) > 50:
threshhold = 1
if (passage2 and conf1 < 1.5) or (tfidf1 - tfidf2) < 10:
ans2, conf2 = self.extractAnswer_model(passage2, question, s=0.20, e=0.20,
wiki=True) if passage2 else (0, 0)
title = 0
if round(conf1, 2) > round(conf2, 2) - threshhold:
print('ans1')
ans = ans1
title = article_title[0] if article_title else 0
else:
print('ans2')
title = article_title[1] if article_title else 0
ans = ans2
if not question[-1] == '?':
question = question+'?'
try:
ans = sent_formation(question, ans)
except:
print(traceback.format_exc())
print(ans, '\n', '\n', article_title)
return ans, title
extractor = extractAnswer()
app = Flask(__name__)
#app.route("/", methods=["POST", "get"])
#app.route("/ans")
def ans():
start = time.time()
question = request.args.get('question')
topic = request.args.get('topic')
passage = request.args.get('passage')
if not question:
return render_template('p.html')
if not topic:
topic = ''
if passage:
answer = extractor.extractAnswer_model(passage, question)
else:
answer, title = extractor.wiki(question, topic)
end = time.time()
if answer:
mytext = str(answer)
language = 'en'
myobj = gTTS(text=mytext, lang=language, slow=False)
myobj.save("welcome.mp3")
# prevName = 'welcome.mp3'
#newName = 'static/welcome.mp3'
#os.rename(prevName,newName)
return render_template('pro.html', answer=answer)
else:
return jsonify(Status='E', Answer=answer, Time=end-start)
#app.route("/audio_del/", methods=["POST", "get"])
def audio_del():
return render_template('p.html');
#app.route("/audio_play/", methods=["POST", "get"])
def audio_play():
os.system("mpg321 welcome.mp3")
return render_template('white.html')
if __name__ == "__main__":
PORT = 7091
HTTP_SERVER = WSGIServer(('0.0.0.0', PORT), app)
print('Running on',PORT, '...')
HTTP_SERVER.serve_forever()
![Output in the terminal for a question I've asked](https://i.stack.imgur.com/6pyv5.jpg)
I came across a possible solution to this after looking deeply into the output returned by the model. Although this is probably not something you can accurately rely on, it seemed to have done the task in my case:
Note that the text answer, "best_span_str", is always a substring of the passage. It spans the range that is stored in "best_span".
i.e., "best_span" contains the start and end index of the answer.
Now, the output data contains a property named "span_end_probs".
"span_end_probs" contains a list of values that correspond to all the words present in the text input.
If you look closely for various inputs, the value is always maximum at one of the indexes within the starting and ending range that "best_span" contains. This value seemed to be very similar to the confidence levels that we need. Let's call this value score. All you need to do now is to try some inputs and find a suitable method to use this score as a metric.
e.g.: if you need a threshold value for some application, you can try a number of test inputs and find a value that is most accurate. In my case, this was around 0.35.
i.e. if the score is less than 0.35, it prints "Answer not found", and if it is greater than or equal to 0.35, it prints the string in "best_span_str".
Here's my code snippet:
from allennlp.predictors.predictor import Predictor
passage = '--INPUT PASSAGE--'
question = '--INPUT QUESTION--'
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bidaf-elmo.2021-02-11.tar.gz")
output = predictor.predict(
    passage = passage,
    question = question
)
score = max(output["span_end_probs"])
if score < 0.35:
    print('Answer not found')
else:
    print(output["best_span_str"])
You can readily see the example input and output here.
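A possible variation (my own sketch, not part of the answer above): the question's own code at one point adds the maximum start and end probabilities together, so both distributions can be folded into a single score if that works better on your test questions. The threshold here is hypothetical and would need tuning:
score = max(output["span_start_probs"]) + max(output["span_end_probs"])
threshold = 0.7   # hypothetical value; tune it on your own question set
if score < threshold:
    print('Answer not found')
else:
    print(output["best_span_str"])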

How can I randomly generate strings with this CFG?

I have this code describing a context-free grammar and I'm trying to generate random strings that match it; for example, like this:
"John thinks that Mary hates every green cat"
But my current output is:
[['_S', ['_NP _VP']], ['_NP', ['_Det _Adj _N', '_Det _N', '_Adj _PropN', '_PropN']], ['_VP', ['_Vi', '_Vt _NP', '_Vc _Comp _S']]]
[['_Det', ['the', 'a', 'some', 'any', 'every']], ['_Adj', ['green', 'young', 'tired', 'confused']], ['_N', ['dog', 'cat']], ['_PropN', ['John', 'Mary']], ['_Vi', ['sleeps', 'walks']], ['_Vt', ['loves', 'hates']], ['_Vc', ['says', 'thinks', 'believes']], ['_Comp', ['that']]]
please help!
import random
psg_rules_str = "S → NP VP\n" \
                "NP → Det Adj N | Det N | Adj PropN | PropN\n" \
                "VP → Vi | Vt NP | Vc Comp S"
terminals_str = "Det → the | a | some | any | every\n" \
                "Adj → green | young | tired | confused\n" \
                "N → dog | cat\n" \
                "PropN → John | Mary\n" \
                "Vi → sleeps | walks\n" \
                "Vt → loves | hates\n" \
                "Vc → says | thinks | believes\n" \
                "Comp → that"
psg_rules_list = [a.split("→") for a in psg_rules_str.split("\n")]
for p in psg_rules_list:
    p[0] = "_" + p[0].strip()
    p[1] = p[1].split("|")
    p[1] = ["_" + a.strip().replace(" ", " _") for a in p[1]]
print(psg_rules_list)
# [['_S', ['_NP _VP']], ['_NP', ['_Det _Adj _N', '_Det _N', '_Adj _PropN', '_PropN']], ['_VP', ['_Vi', '_Vt _NP', '_Vc _Comp _S']]]
terminals_list = [a.split("→") for a in terminals_str.split("\n")]
for t in terminals_list:
    t[0] = "_" + t[0].strip()
    t[1] = t[1].split("|")
    t[1] = [a.strip() for a in t[1]]
print(terminals_list)
# [['_Det', ['the', 'a', 'some', 'any', 'every']], ['_Adj', ['green', 'young', 'tired', 'confused']], ['_N', ['dog', 'cat']], ['_PropN', ['John', 'Mary']], ['_Vi', ['sleeps', 'walks']], ['_Vt', ['loves', 'hates']], ['_Vc', ['says', 'thinks', 'believes']], ['_Comp', ['that']]]
def reachTerminals(from_nts, with_rules, with_ts):
    from_nts = str.upper("_" + from_nts.replace("_", "").strip().replace(" ", " _"))
    rule_tags = [a[0] for a in with_rules]
    ts_tags = [a[0] for a in with_ts]
    nts_todo = [a for a in rule_tags if a in from_nts]
    while nts_todo != list():
        tag = nts_todo[0]
        wr_index = rule_tags.index(tag)
        repl_choices = with_rules[wr_index][1]
        nts_todo = [a for a in rule_tags if a in from_nts]
sentence = reachTerminals(from_nts="s", with_rules=psg_rules_list, with_ts=terminals_list)
You nearly have the program working. Here's a way to complete the reachTerminals function:
import random
psg_rules_str = "S → NP VP\n" \
                "NP → Det Adj N | Det N | Adj PropN | PropN\n" \
                "VP → Vi | Vt NP | Vc Comp S"
terminals_str = "Det → the | a | some | any | every\n" \
                "Adj → green | young | tired | confused\n" \
                "N → dog | cat\n" \
                "PropN → John | Mary\n" \
                "Vi → sleeps | walks\n" \
                "Vt → loves | hates\n" \
                "Vc → says | thinks | believes\n" \
                "Comp → that"
psg_rules_list = [a.split("→") for a in psg_rules_str.split("\n")]
for p in psg_rules_list:
    p[0] = "_" + p[0].strip()
    p[1] = p[1].split("|")
    p[1] = ["_" + a.strip().replace(" ", " _") for a in p[1]]
terminals_list = [a.split("→") for a in terminals_str.split("\n")]
for t in terminals_list:
    t[0] = "_" + t[0].strip()
    t[1] = t[1].split("|")
    t[1] = [a.strip() for a in t[1]]
def reachTerminals(from_nts, with_rules, with_ts):
    from_nts = str.upper("_" + from_nts.replace("_", "").strip().replace(" ", " _"))
    rule_tags = [a[0] for a in with_rules]
    ts_tags = [a[0] for a in with_ts]
    nts_todo = [a for a in rule_tags if a in from_nts]
    while nts_todo:
        for tag in nts_todo:
            wr_index = rule_tags.index(tag)
            repl_choices = with_rules[wr_index][1]
            choice = random.choice(repl_choices)
            from_nts = from_nts.replace(tag, choice, 1)
        nts_todo = [a for a in rule_tags if a in from_nts]
    ts_todo = [a for a in ts_tags if a in from_nts]
    while ts_todo:
        for tag in ts_todo:
            wr_index = ts_tags.index(tag)
            repl_choices = with_ts[wr_index][1]
            choice = random.choice(repl_choices)
            from_nts = from_nts.replace(tag, choice, 1)
        ts_todo = [a for a in ts_tags if a in from_nts]
    return from_nts
print(reachTerminals(from_nts = "s", with_rules = psg_rules_list, with_ts = terminals_list))
The important tools for you to use are the random.choice function and the str.replace function's third parameter, which lets you only replace the first occurrence of the substring. I haven't thoroughly tested the code, but it seems to be working as expected. Example outputs:
green John loves some confused dog
Mary says that the tired dog says that some green cat hates some cat
every green dog loves young John
John loves the tired cat
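An alternative sketch (mine, not part of the answer above): the same grammar can be expanded recursively from dictionaries, which avoids the string-substitution bookkeeping entirely. The dict form of the grammar below is just a restatement of the strings in the question:
import random

rules = {
    "S": ["NP VP"],
    "NP": ["Det Adj N", "Det N", "Adj PropN", "PropN"],
    "VP": ["Vi", "Vt NP", "Vc Comp S"],
}
terminals = {
    "Det": ["the", "a", "some", "any", "every"],
    "Adj": ["green", "young", "tired", "confused"],
    "N": ["dog", "cat"],
    "PropN": ["John", "Mary"],
    "Vi": ["sleeps", "walks"],
    "Vt": ["loves", "hates"],
    "Vc": ["says", "thinks", "believes"],
    "Comp": ["that"],
}

def expand(symbol):
    # pick a random alternative and recurse until only terminals remain
    if symbol in terminals:
        return random.choice(terminals[symbol])
    if symbol in rules:
        return ' '.join(expand(s) for s in random.choice(rules[symbol]).split())
    return symbol

print(expand("S"))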

How to standardize address type properly

I'm trying to standardize street addresses by converting abbreviations to the full word (e.g. RD - Road). I created many lines to account for different spellings and ran into an issue where one replace call overrode another one.
import pandas as pd
mydata = {'Street_type': ['PL', 'pl', 'Pl', 'PLACE', 'place']}
mydata = pd.DataFrame(mydata)
mydata['Street_type'] = mydata['Street_type'].replace('PL','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('Pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('PLACE','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('place','Place',regex=True)
Instead of Place, I got Placeace. What is the best way to avoid this error? Do I write an if-else statement or some other function? Thanks in advance!
Among other problems, you have overlapping logic: you fail to check that the target ("old") string is a full word before you replace it. For instance, with the input type of "PLACE", you trigger both the first and third replacements, generating PlaceACE and then PlaceaceACE before you get to the condition you wanted.
You need to work through your tracking and exclusion logic carefully, and then apply only one of the replacements. You can check the length of the street_type and apply the unique transition you need for that length.
If you're trying to convert a case statement, then you need to follow that logic pattern, rather than the successive applications you coded. You can easily look up how to simulate a "case" statement in Python.
Also consider using a translation dictionary, such as
type_trans = {
"pl": "Place",
"Pl": "Place",
"PLACE": "Place",
...
}
Then your change is simply
mydata['Street_type'] = mydata['Street_type'].replace(type_trans)
Also, you might list all of the variants in a tuple, such as:
type_place = ("PL", "Pl", "pl", "PLACE", "place")
mydata.loc[mydata['Street_type'].isin(type_place), 'Street_type'] = "Place"
... but be sure to generalize this properly for your entire list of street types.
You can do this correctly with a single pass if you use a proper regex here, e.g. use word boundaries (\b):
In [11]: places = ["PL", "pl", "Pl", "PLACE", "Place", "place"]
In [12]: mydata.Street_type
Out[12]:
0 PL
1 pl
2 Pl
3 PLACE
4 place
Name: Street_type, dtype: object
In [13]: mydata.Street_type.replace("(^|\b)({})(\b|$)".format("|".join(places)), "Place", regex=True)
Out[13]:
0 Place
1 Place
2 Place
3 Place
4 Place
Name: Street_type, dtype: object
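A hedged sketch generalizing the word-boundary idea to several street types at once (the abbreviation map below is illustrative, not from the original post):
import re
import pandas as pd

abbrev = {"PL": "Place", "RD": "Road", "ST": "Street"}   # hypothetical map; extend as needed
mydata = pd.DataFrame({"Street_type": ["PL", "pl", "Pl", "PLACE", "place", "RD"]})

for abbr, full in abbrev.items():
    # match either the abbreviation or the full word, whole words only, case-insensitive
    pattern = r"\b({}|{})\b".format(re.escape(abbr), re.escape(full))
    mydata["Street_type"] = mydata["Street_type"].str.replace(pattern, full, case=False, regex=True)

print(mydata)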
# Needleman-Wunsch
def zeros(shape):
    retval = []
    for x in range(shape[0]):
        retval.append([])
        for y in range(shape[1]):
            retval[-1].append(0)
    return retval
match_award = 10
mismatch_penalty = -3
gap_penalty = -4 # both for opening and extending
def match_score(alpha, beta):
    if alpha == beta:
        return match_award
    elif alpha == '-' or beta == '-':
        return gap_penalty
    else:
        return mismatch_penalty
def finalize(align1, align2):
    align1 = align1[::-1] #reverse sequence 1
    align2 = align2[::-1] #reverse sequence 2
    i,j = 0,0
    #calculate identity, score and aligned sequences
    symbol = ''
    found = 0
    score = 0
    identity = 0
    for i in range(0,len(align1)):
        # if two AAs are the same, then output the letter
        if align1[i] == align2[i]:
            symbol = symbol + align1[i]
            identity = identity + 1
            score += match_score(align1[i], align2[i])
        # if they are not identical and none of them is gap
        elif align1[i] != align2[i] and align1[i] != '-' and align2[i] != '-':
            score += match_score(align1[i], align2[i])
            symbol += ' '
            found = 0
        #if one of them is a gap, output a space
        elif align1[i] == '-' or align2[i] == '-':
            symbol += ' '
            score += gap_penalty
    identity = float(identity) / len(align1) * 100
    print('Similarity =', "%3.3f" % identity, 'percent')
    print('Score =', score)
    # print(align1)
    # print(symbol)
    # print(align2)
def needle(seq1, seq2):
    m, n = len(seq1), len(seq2) # length of two sequences
    # Generate DP table and traceback path pointer matrix
    score = zeros((m+1, n+1)) # the DP table
    # Calculate DP table
    for i in range(0, m + 1):
        score[i][0] = gap_penalty * i
    for j in range(0, n + 1):
        score[0][j] = gap_penalty * j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            match = score[i - 1][j - 1] + match_score(seq1[i-1], seq2[j-1])
            delete = score[i - 1][j] + gap_penalty
            insert = score[i][j - 1] + gap_penalty
            score[i][j] = max(match, delete, insert)
    # Traceback and compute the alignment
    align1, align2 = '', ''
    i,j = m,n # start from the bottom right cell
    while i > 0 and j > 0: # end touching the top or the left edge
        score_current = score[i][j]
        score_diagonal = score[i-1][j-1]
        score_up = score[i][j-1]
        score_left = score[i-1][j]
        if score_current == score_diagonal + match_score(seq1[i-1], seq2[j-1]):
            align1 += seq1[i-1]
            align2 += seq2[j-1]
            i -= 1
            j -= 1
        elif score_current == score_left + gap_penalty:
            align1 += seq1[i-1]
            align2 += '-'
            i -= 1
        elif score_current == score_up + gap_penalty:
            align1 += '-'
            align2 += seq2[j-1]
            j -= 1
    # Finish tracing up to the top left cell
    while i > 0:
        align1 += seq1[i-1]
        align2 += '-'
        i -= 1
    while j > 0:
        align1 += '-'
        align2 += seq2[j-1]
        j -= 1
    finalize(align1, align2)
needle('kizlerlo','killerpo' )
***********************************************************************************************************************
#import textdistance as txd
import numpy
txd.overlap('kizlerlo','kilerpo' )
txd.jaro('kizlerlo','killerpo' )
txd.cosine('kizlerlo','killerpo' )
#txd.needleman_wunsch('kizlerlo','killerpo' )
txd.jaro_winkler('kizlerlo','killerpo' )
#txd.smith_waterman('Loans and Accounts','Loans Accounts' )
#txd.levenshtein.normalized_similarity('Loans and Accounts','Loans Accounts' )
from scipy.spatial import distance
a = 'kizlerlo'
b = 'kilerpoo'
#txd.gotoh('Loans and Accounts','Loans Accounts' )
print(txd.needleman_wunsch.normalized_similarity('Loans and Accounts','Loans Accounts' ))
***************************************************************************************************************************
#Euclidean
import math
import numpy as np
def euclid(str1,str2):
    dist=0.0
    x=str1
    y=str2
    set1=set()
    for a in range(0,len(x)):
        set1.add(x[a])
    for a in range(0,len(y)):
        set1.add(y[a])
    vec1=[None]*len(set1)
    vec2=[None]*len(set1)
    for counter,each_char in enumerate(set1):
        vec1[counter]=x.count(each_char)
        vec2[counter]=y.count(each_char)
    dist=1/(1+math.sqrt(sum([(a - b) ** 2 for a, b in zip(vec1, vec2)])))
    print(dist)
euclid('kizlerlo','killerpo')
***************************************************************************************************************************
from similarity.qgram import QGram
import affinegap
qgram = QGram(2)
#print(qgram.distance('kizlerlo', 'killerpo'))
affinegap.affineGapDistance('kizlerlokill' ,'erpozlerlzler')
***************************************************************************************************************************
#manhattan
def manhattan(str1,str2):
    dist=0.0
    x=str1
    y=str2
    set1=set()
    for a in range(0,len(x)):
        set1.add(x[a])
    for a in range(0,len(y)):
        set1.add(y[a])
    vec1=[None]*len(set1)
    vec2=[None]*len(set1)
    for counter,each_char in enumerate(set1):
        vec1[counter]=x.count(each_char)
        vec2[counter]=y.count(each_char)
    #dist= sum([np.abs(a - b) for a, b in zip(vec1, vec2)])
    dist=1/(1+sum([np.abs(a - b) for a, b in zip(vec1, vec2)]))
    print(dist)
manhattan('kizlerlo','killerpo')
import jellyfish
import json
from Levenshtein import distance,jaro_winkler,jaro,ratio,seqratio
def comp(a,b):
    return jellyfish.jaro_winkler(a,b)*100 + distance(a,b) + jaro(a,b)*100
ip = {"CED":"WALMART INC_10958553"}
ala = {}
for index,row in df_ala.iterrows():
    a = ip.get("CED")
    b = row['NN_UID']
    c = comp(a,b)
    ala.update({row['N_UID'] : c})
ala_max = max(ala, key=ala.get)
ala_f = {"ALACRA" : ala_max}
ces_f = {"CESIUM" : "WALMART_10958553_CESIUM"}
dun_f = {"DUNS" : "WALMART_10958053_DUNS"}
ref_f = {"REF" : "WALMART INC_10958553_REF"}
cax_f = {"CAX" : "WALMART LTD_10958553_CAX"}
final_op = {**ala_f,**ces_f,**dun_f,**ref_f,**cax_f }
final_json = json.dumps(final_op)
print(final_json)
from flask import Flask,request, jsonify
app = Flask(__name__)
@app.route('/test',methods = ['GET','POST'])
def test():
    if request.method == "GET":
        return jsonify({"response":"Get request called"})
    elif request.method == "POST":
        req_Json = request.json
        name = req_Json['name']
        return jsonify({"response": "Hi" + name})
if __name__ == '__main__':
    app.run(debug = True,port = 9090)
{
"name": "Mike"
}
import usaddress
import pandas as pd
import statistics
#sa = dict(usaddress.parse('123 Main St. Suite Chicago, IL' ))
adr = pd.read_excel('C:\\VINAYAK\\Address.xlsx')
adr.columns = ['Address']
strlen = []
scr = []
loop = adr['Address'].tolist()
for i in loop:
    strlen.append(len(i))
x = statistics.median(strlen)
for i in loop:
    sa = dict(usaddress.parse(i))
    sa = list(sa.values())
    a = 0
    if len(i) > x :
        a+= 5
    if 'AddressNumber' in sa :
        a+= 23
    if 'StreetName' in sa :
        #a = a + 20
        a+= 17
    if 'OccupancyType' in sa :
        a+= 6
    if 'OccupancyIdentifier' in sa :
        a+= 12
    if 'PlaceName' in sa :
        a+= 12
    if 'StateName' in sa :
        a+= 13
    if 'ZipCode' in sa :
        a+= 12
    scr.append(a)
adr['Adr_Score'] = scr
adr.head()
#(pd.DataFrame([(key) for key in sa.items()])).transpose()
#pd.DataFrame(dict([(value, key) for key, value in sa.items()]))
#pd.DataFrame(dict([(value, key) for key, value in sa.items()]))
# df_ts = pd.DataFrame(columns = ['AddressNumber' , 'Age', 'City' , 'Country'])
# df_ts.append(sa, ignore_index=False, verify_integrity=False, sort=None)
# df_ts.head()
import pandas as pd
from zipfile import ZipFile
# core = []
# f = open('C:/Users/s.natarajakarayalar/1.txt','r')
# core.append(str(f.readlines()))
# print(core)
import os
import zipfile
import re
import nltk
import os
core = []
with zipfile.ZipFile('C:/Users/s.natarajakarayalar/TF.zip') as z:
    a = 0
    for filename in z.namelist():
        #if a < 1:
        #if not os.path.isdir(filename):
        # read the file
        with z.open(filename) as f:
            #a = 2
            x = f.readlines()
            core = core + x
with open('C:/Users/s.natarajakarayalar/fins.txt', 'w') as f:
    for item in core:
        f.write("%s\n" % item)
# for i in core:
# if k < 5:
# tkt = re.sub(r'.*CONTENT', '', i)
# new_core.append(tkt)
# k = k+1
# for item in core:
# new_core.append(len(item.split()))
# print(sum(new_core))
# from nltk.tokenize import word_tokenize
# new_core = []
# stp = ['URL:https://','TITLE:b','META-KEYWORDS:','None','DOC ID:','CONTENT:b','URL:','TITLE:','META-CONTENT:']
# #new_core = [word for word in core if word not in stopwords]
# for i in core:
# wk = word_tokenize(i)
# for w in wk:
# if w not in stp:
# new_core.append(w)

How to calculate std dev in a file

I have an input file like this:
@a
CTCTCTTTAGAACAATCATCACATACCCCTGGTT
+
1>1A>FDF3113B11BB1BAED1D11BAEE0ABG
@b
CAATGCAGCGAACAATGTTCTGGTGGTTGAATTT
+
111>1#11C?1AE?BFDGGGGGBGCCGGGGHHHH
....
I calculated the average quality scores in this file, but I need help calculating the standard deviation of those scores.
Here is what I did:
import sys,csv
import numpy as np
import math
r = open(sys.argv[1],"r")
length = 200
a = np.zeros(length)
b = np.zeros(length)
av = np.zeros(length)
while True:
    id = r.readline().rstrip()
    if id == "":
        break
    seq = r.readline().rstrip()
    p = r.readline().rstrip()
    qua = r.readline().rstrip()
    l = len(qua)
    q = [ord(character) - 33 for character in qua]
    for i in range(l):
        a[i] += q[i]
        b[i] += 1
for j in range(length):
    av[j] = float(a[j]) / b[j]
    s2 = sum((x - av[j])**2 for x in q[i])/b[j] # error in this
    standard_deviation = s2**0.5
    print(standard_deviation)
How about this?
import sys,csv
import numpy as np
import math
r = open(sys.argv[1],"r")
while True:
    id = r.readline().rstrip()
    if id == "":
        break
    seq = r.readline().rstrip()
    p = r.readline().rstrip()
    qua = r.readline().rstrip()
    q = [ord(character) - 33 for character in qua]
    mean = np.mean(q)
    standard_deviation = np.std(q)
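If the goal is the per-position standard deviation across all reads (matching the per-position averages in the question) rather than the per-read value above, one way is to accumulate the sum and the sum of squares per position. A rough sketch, assuming the same FASTQ-style input:
import sys
import numpy as np

length = 200
total = np.zeros(length)       # sum of scores per position
total_sq = np.zeros(length)    # sum of squared scores per position
count = np.zeros(length)       # number of reads covering each position

with open(sys.argv[1]) as r:
    while True:
        header = r.readline().rstrip()
        if header == "":
            break
        r.readline()           # sequence line
        r.readline()           # '+' separator
        qua = r.readline().rstrip()
        for i, c in enumerate(qua):
            score = ord(c) - 33
            total[i] += score
            total_sq[i] += score ** 2
            count[i] += 1

for j in range(length):
    if count[j]:
        mean = total[j] / count[j]
        var = max(total_sq[j] / count[j] - mean ** 2, 0.0)
        print(j, mean, var ** 0.5)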
