I'm trying to convert text into a bunch of skipgrams with the help of Keras's Tokenizer. I have code that works for single sentences, but not for batches of sentences. Namely, when I try to run the method at the very bottom, I get an error message:
> Traceback (most recent call last):
>   File "/Users/montana/Documents/Capstone project/skipgrams.py", line 61, in word2vec_nn_generator
>     question_word_targets,question_word_contexts = sequences_to_skipgrams(question_sequences)
>   File "/Users/montana/miniconda3/lib/python3.6/site-packages/numpy/lib/function_base.py", line 1972, in __call__
>     return self._vectorize_call(func=func, args=vargs)
>   File "/Users/montana/miniconda3/lib/python3.6/site-packages/numpy/lib/function_base.py", line 2042, in _vectorize_call
>     ufunc, otypes = self._get_ufunc_and_otypes(func=func, args=args)
>   File "/Users/montana/miniconda3/lib/python3.6/site-packages/numpy/lib/function_base.py", line 2002, in _get_ufunc_and_otypes
>     outputs = func(*inputs)
>   File "/Users/montana/Documents/Capstone project/skipgrams.py", line 54, in <lambda>
>     sequences_to_skipgrams = np.vectorize(lambda x: sequence_to_skipgrams(x,3,len(textTokenizer.word_index) + 1))
>   File "/Users/montana/Documents/Capstone project/skipgrams.py", line 48, in sequence_to_skipgrams
>     couples, labels = skipgrams(data, vocab_size, window_size=window_size)
>   File "/Users/montana/miniconda3/lib/python3.6/site-packages/keras_preprocessing/sequence.py", line 197, in skipgrams
>     for i, wi in enumerate(sequence):
> TypeError: 'numpy.int32' object is not iterable
>
> During handling of the above exception, another exception occurred:
>
> Traceback (most recent call last):
>   File "/Users/montana/Documents/Capstone project/skipgrams.py", line 72, in <module>
>     for i in word2vec_nn_generator(questionsTokenized_train,contextTokenized_train,trainingData["answer_start"],1):
>   File "/Users/montana/Documents/Capstone project/skipgrams.py", line 65, in word2vec_nn_generator
>     raise ValueError("Exception in word2vec_nn_generator.")
> ValueError: Exception in word2vec_nn_generator.
What exactly is this error message indicating, and how can I fix it?
import json
import numpy as np
import pandas as pd
import os
assert os.path.isfile("train-v1.1.json"),"Non-existent file"
from tensorflow.python.client import device_lib
import tensorflow.compat.v1 as tf
#import keras
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import re
regex = re.compile(r'\W+')
#Reading the files.
def readFile(filename):
    with open(filename) as file:
        fields = []
        JSON = json.loads(file.read())
        articles = []
        for article in JSON["data"]:
            articleTitle = article["title"]
            article_body = []
            for paragraph in article["paragraphs"]:
                paragraphContext = paragraph["context"]
                article_body.append(paragraphContext)
                for qas in paragraph["qas"]:
                    question = qas["question"]
                    answer = qas["answers"][0]
                    fields.append({"question":question,"answer_text":answer["text"],"answer_start":answer["answer_start"],"paragraph_context":paragraphContext,"article_title":articleTitle})
            article_body = "\\n".join(article_body)
            article = {"title":articleTitle,"body":article_body}
            articles.append(article)
        fields = pd.DataFrame(fields)
        fields["question"] = fields["question"].str.replace(regex," ")
        assert not (fields["question"].str.contains("catalanswhat").any())
        fields["paragraph_context"] = fields["paragraph_context"].str.replace(regex," ")
        fields["answer_text"] = fields["answer_text"].str.replace(regex," ")
        assert not (fields["paragraph_context"].str.contains("catalanswhat").any())
        fields["article_title"] = fields["article_title"].str.replace("_"," ")
        assert not (fields["article_title"].str.contains("catalanswhat").any())
        return fields,JSON["data"]
trainingData,training_JSON = readFile("train-v1.1.json")
print("JSON dataset read.")
#Text preprocessing
## Converting text to skipgrams
from tensorflow.keras.preprocessing.text import *
from tensorflow.keras.preprocessing.sequence import skipgrams,make_sampling_table
def sequence_to_skipgrams(data,window_size,vocab_size):
    sampling_table = make_sampling_table(vocab_size)
    couples, labels = skipgrams(data, vocab_size, window_size=window_size)
    assert len(couples) > 0
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    return word_target,word_context
sequences_to_skipgrams = np.vectorize(lambda x: sequence_to_skipgrams(x,3,len(textTokenizer.word_index) + 1))
def word2vec_nn_generator(question_sequences,context_sequences,answer_starts,batch_size):
    while True:
        sequence_indices = np.random.randint(0,high=question_sequences.shape[0],size=10)
        question_sequences = question_sequences[sequence_indices,:]
        context_sequences = context_sequences[sequence_indices,:]
        try:
            question_word_targets,question_word_contexts = sequences_to_skipgrams(question_sequences)
            context_word_targets,context_word_contexts = sequences_to_skipgrams(context_sequences)
            yield question_word_targets,question_word_contexts,context_word_targets,context_word_contexts,answer_starts
        except Exception as e:
            raise ValueError("Exception in word2vec_nn_generator.")
strings = trainingData.drop("answer_start",axis=1)
strings = strings.values.flatten()
textTokenizer = Tokenizer()
textTokenizer.fit_on_texts(strings)
questionsTokenized_train = pad_sequences(textTokenizer.texts_to_sequences(trainingData["question"]))
contextTokenized_train = pad_sequences(textTokenizer.texts_to_sequences(trainingData["paragraph_context"]))
for i in word2vec_nn_generator(questionsTokenized_train,contextTokenized_train,trainingData["answer_start"],1):
    print(i)
    break
This message indicates that the tf.keras.preprocessing.sequence.skipgrams function is receiving its sequence argument in the wrong format.
Check the contents of the data variable in your sequence_to_skipgrams function and compare it with the required format:
sequence: A word sequence (sentence), encoded as a list
of word indices (integers). If using a `sampling_table`,
word indices are expected to match the rank
of the words in a reference dataset (e.g. 10 would encode
the 10-th most frequently occurring token).
Note that index 0 is expected to be a non-word and will be skipped.
Source
The sequence comes from the question_sequences variable, so most likely something is wrong with the way you slice it in the while loop of the word2vec_nn_generator function.
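In your case the 2-D array of padded sequences is handed to np.vectorize, which calls the wrapped function once per element rather than once per row, so skipgrams ends up receiving a single numpy.int32 instead of a list of word indices. A small sketch with a hypothetical check function shows the behaviour:

import numpy as np

def check(x):
    # hypothetical stand-in: np.vectorize hands this one element at a time
    print(type(x))   # <class 'numpy.int32'>, never a whole row
    return 0

batch = np.array([[1, 2, 3], [4, 5, 6]], dtype="int32")
np.vectorize(check)(batch)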
I found the error:
np.vectorize is not the function I wanted to use. np.vectorize assumes that it's given a function that takes a single value, such as a number.
What I wanted to do instead was np.apply_along_axis, which makes a different assumption: that the given function takes a 1-D array.
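For example, a toy sketch with a stand-in function that returns fixed-length arrays (np.apply_along_axis needs every call to return the same shape, unlike the real skipgrams output):

import numpy as np

def row_to_pairs(row):
    # receives an entire 1-D row, unlike np.vectorize which passes single elements
    return np.array([row[:-1], row[1:]])

batch = np.array([[1, 2, 3, 4], [2, 3, 4, 5]], dtype="int32")
pairs = np.apply_along_axis(row_to_pairs, 1, batch)
print(pairs.shape)  # (2, 2, 3): one pair of arrays per input row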
Related
I am trying to simulate two FMUs, one of which takes its inputs from a CSV file, using the Master. What I have tried is the following:
from pyfmi import load_fmu
from pyfmi import Master
import pandas as pd
electricity_network = load_fmu(r"C:\Users\kosmy\Pandapower_Reduced.fmu")
pv = load_fmu(r"C:\Users\kosmy\Photovoltaics.Model.PVandWeather_simple.fmu")
load_file = r"C:\Users\kosmy\load_prof_sec_res.csv"
load = pd.read_csv(load_file)
models = [electricity_network, pv]
connections = [(pv, "P_MW", electricity_network, "P_pv1"),
               (pv, "P_MW", electricity_network, "P_pv2")]
master_simulator = Master(models, connections)
input_object = [((electricity_network, 'P_load1'), load),
                ((electricity_network, 'P_load2'), load)]
res = master_simulator.simulate(final_time = 86400, input = input_object)
I am getting the following error:
Traceback (most recent call last):
File "C:\Users\kosmy\run_csv_pyfmi.py", line 29, in <module>
res = master_simulator.simulate(final_time = 86400, input = input_object)
File "src\pyfmi\master.pyx", line 1474, in pyfmi.master.Master.simulate
File "src\pyfmi\master.pyx", line 1369, in pyfmi.master.Master.specify_external_input
TypeError: tuple indices must be integers or slices, not tuple
Apparently, I do not give the correct format to the input, but I have not found an example demonstrating the correct format when using the Master.
Does anyone know how I can use the input in this case?
import math

def load(t):
    return 10, math.cos(t)

input_object = ([(electricity_network, 'P_load1'), (electricity_network, 'P_load2')], load)
Another option is:
data = np.transpose(np.vstack((t,u,v)))
input_object = (['InputVarI','InputVarP'],data)
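If the values should come from the CSV you are already loading, the data matrix can be built from it. A sketch, assuming the file has a time column plus one column per load and that the Master accepts the (variables, data) form shown above; the column names are hypothetical and need to match the real headers:

import numpy as np
import pandas as pd

load = pd.read_csv(r"C:\Users\kosmy\load_prof_sec_res.csv")
# hypothetical column names - replace them with the actual headers of the CSV
t = load["time"].values
u = load["P_load1"].values
v = load["P_load2"].values

data = np.transpose(np.vstack((t, u, v)))
input_object = ([(electricity_network, "P_load1"), (electricity_network, "P_load2")], data)
res = master_simulator.simulate(final_time=86400, input=input_object)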
See also: error while creating a 2-tuple as input for model.simulate() of fmu model with pyfmi
I am trying to crawl abstracts from PubMed and filter them with a regex in Python. To speed things up, I wanted to use Python's multiprocessing pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple
def beautify_abstract(abstract: str, regex: str):
    import re
    result: str = ""
    last_start = 0
    matches = re.finditer(regex, abstract, re.MULTILINE)
    for matchNum, match in enumerate(matches, start=1):
        result += abstract[last_start:match.start()]
        result += "<b>"
        result += abstract[match.start():match.end()]
        result += "</b>"
        last_start = match.end()
    result += abstract[last_start:]
    return result

def get_doi(pim: str, regex: str):
    from Bio import Entrez
    from Bio.Entrez import efetch
    import re
    from metapub.convert import pmid2doi
    Entrez.email = "Your.Name.Here#example.org"
    print(f"Processing {pim}")
    abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
    abstract = abstract_handle.read()
    abstract_handle.close()
    if re.search(regex, abstract, re.MULTILINE) is not None:
        docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum').read()
        docsum = docsum_handle.read()
        try:
            doi = pmid2doi(pim)
        except:
            doi = "UNKNOWN"
        return f"{doi}"
    return ""

def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
    from Bio import Entrez
    Entrez.email = "Your.Name.Here#example.org"
    searchterm = " ".join(keywords)
    pims = []
    handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
    record = Entrez.read(handle)
    handle.close()
    count = int(record['Count'])
    if count > 100000:
        retmax = 100000
    else:
        retmax = count
    retstart = 0
    while retstart < count:
        handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
        record = Entrez.read(handle)
        handle.close()
        for pim in record['IdList']:
            pims.append(pim)
        retstart += retmax
    return pims

if __name__ == '__main__':
    keywords = ["keyword1", "keyword2"]
    pim_list = get_pim_with_regex_list(keywords)
    regex = "keyword1 keyword2"
    worker_fn = partial(get_doi, regex=regex)
    pool = mp.Pool(mp.cpu_count())
    entries = pool.map(worker_fn, pim_list)
    pool.close()
    pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing in Python and found out that only Python native types are supported as parameters (enforced by the ForkingPickler).
Assuming that str is a native type, the code should work. Currently, I am completely lost and have no idea what the problem might be.
As suggested, I uploaded a minimal (sequential) working example here
Is there any way to fix this problem or at least diagnose the real issue here?
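One check I can think of is to round-trip the pool's arguments through pickle by hand, since the traceback comes out of _ForkingPickler.loads. If the IDs returned by Entrez.read are not plain str but some parser-specific subclass, this may reproduce the error outside the pool (a diagnostic sketch, using pim_list from the code above):

import pickle

sample = pim_list[0]
print(type(sample))  # is this really a plain str?
restored = pickle.loads(pickle.dumps(sample))
print(restored)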
This program is supposed to emit sound based on a CSV file.
There is a frequency range in the dataset of 37-32677. In the beginning I didn't add this in and got this same error message. I tried adding in this range and I am still getting the same error.
import winsound
import csv
winsound.Beep(261,100)
def preload(filename):
    file = open(filename)
    data = csv.reader(file)
    return data

def getNote(sensorVal):
    return int(sensorVal * 75)

def setup():
    cleanedData = {}
    notes = []
    data = preload("data1.csv")
    for row in data(range(36,32677)):
        print(row)
        if row[1] != "trial number":
            sensorVal = float(row[4])
            channel = int(row[7])
            if channel not in cleanedData:
                cleanedData[channel] = []
            cleanedData[channel].append({"sensorVal":sensorVal})
            notes.append(getNote(sensorVal))
    return cleanedData,notes

def play(notes,time):
    for note in notes:
        winsound.Beep(note,time)

data, notes = setup()
play(notes, 200)
Error message:
Traceback (most recent call last):
File "C:/Users/clair/PycharmProjects/winSound/main.py", line 32, in <module>
data, notes = setup()
File "C:/Users/clair/PycharmProjects/winSound/main.py", line 16, in setup
for row in data(range(36,32677)):
TypeError: '_csv.reader' object is not callable
Process finished with exit code 1
I am trying to follow the steps listed here to update a feature on AGOL from a local feature class. I keep getting a circular reference error within the for loop and I'm not sure why it's happening.
Please see the code I'm using below.
import arcgis, arcpy, csv, os, time, copy, pandas as pd
from arcgis.gis import GIS
from pandas import DataFrame
from copy import deepcopy
gis = GIS("url", "username","pass")
fc = gis.content.get('ItemID')
flayer = fc.layers[0]
fset=flayer.query()
fields = ('GPS_Time','Visibility','EngineeringSection','Condition')
UpdateLayer = "C:\\Users\\USer\\Documents\\ArcGIS\\Default.gdb\\Data"
UpdateTable=DataFrame(arcpy.da.FeatureClassToNumPyArray(UpdateLayer , fields, skip_nulls=True))
overlap_rows = pd.merge(left=fset.sdf, right = UpdateTable, how='inner', on='EngineeringSection')
features_for_update = []
all_features = fset.features
for EngSec in overlap_rows['EngineeringSection']:
    original_feature = [f for f in all_features if f.attributes['EngineeringSection'] == EngSec][0]
    feature_to_be_updated = deepcopy(original_feature)
    matching_row = UpdateTable.where(UpdateTable['EngineeringSection'] == EngSec).dropna()
    original_feature.attributes['GPS_Time'] = (matching_row['GPS_Time'])
    original_feature.attributes['Visibility'] = int(matching_row['Visibility'])
    original_feature.attributes['Condition'] = str(matching_row['Condition'])
    update_result = flayer.edit_features(updates=[original_feature])

flayer.edit_features(updates= features_for_update)
Here is the error I receive:
Traceback (most recent call last):
File "<stdin>", line 9, in <module>
File "C:\Program Files\ArcGIS\Pro\bin\Python\envs\arcgispro-py3\lib\site-packages\arcgis\features\layer.py", line 1249, in edit_features
default=_date_handler)
File "C:\Program Files\ArcGIS\Pro\bin\Python\envs\arcgispro-py3\lib\json\__init__.py", line 238, in dumps
**kw).encode(obj)
File "C:\Program Files\ArcGIS\Pro\bin\Python\envs\arcgispro-py3\lib\json\encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "C:\Program Files\ArcGIS\Pro\bin\Python\envs\arcgispro-py3\lib\json\encoder.py", line 257, in iterencode
return _iterencode(o, 0)
ValueError: Circular reference detected
The line below assigns a whole pandas Series, not a single value, as the attribute value. Is that what you wanted?
original_feature.attributes['GPS_Time'] = (matching_row['GPS_Time'])
If you want to assign just the value, pull it out of the Series:
original_feature.attributes['GPS_Time'] = matching_row['GPS_Time'].iloc[0]
Also, I think this line:
flayer.edit_features(updates= features_for_update)
Should be:
flayer.edit_features(updates=[feature_to_be_updated])
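Putting both points together, a sketch of the loop might look like the following (pulling scalars out of the matching row with .iloc[0] so plain Python values, not pandas objects, are written to the attributes, and batching the edits into one call):

features_for_update = []
for EngSec in overlap_rows['EngineeringSection']:
    original_feature = [f for f in all_features if f.attributes['EngineeringSection'] == EngSec][0]
    feature_to_be_updated = deepcopy(original_feature)
    matching_row = UpdateTable.where(UpdateTable['EngineeringSection'] == EngSec).dropna()
    feature_to_be_updated.attributes['GPS_Time'] = matching_row['GPS_Time'].iloc[0]
    feature_to_be_updated.attributes['Visibility'] = int(matching_row['Visibility'].iloc[0])
    feature_to_be_updated.attributes['Condition'] = str(matching_row['Condition'].iloc[0])
    features_for_update.append(feature_to_be_updated)

# one batched call instead of one edit_features call per feature
flayer.edit_features(updates=features_for_update)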
Thanks for your help, I was able to get it all running with this script. I also added in some timing to see how long it was taking:
import arcpy, csv, os, time
import pandas as pd
from arcgis.gis import GIS
from pandas import DataFrame
from copy import deepcopy
start_time = time.time()
gis = GIS("url", "user","pass")
fc = gis.content.get('ContentID')
flayer = fc.layers[0]
fset=flayer.query()
fields = ('GPS_Time','Visibility','EngineeringSection','Condition')
UpdateLayer = "C:\\Users\\user\\Documents\\ArcGIS\\Default.gdb\\data"
UpdateTable=DataFrame(arcpy.da.FeatureClassToNumPyArray(UpdateLayer , fields, skip_nulls=True))
overlap_rows = pd.merge(left=fset.sdf, right = UpdateTable, how='inner', on='EngineeringSection')
features_for_update = []
all_features = fset.features
for EngSec in overlap_rows['EngineeringSection']:
    original_feature = [f for f in all_features if f.attributes['EngineeringSection'] == EngSec][0]
    feature_to_be_updated = deepcopy(original_feature)
    matching_row = UpdateTable.where(UpdateTable['EngineeringSection'] == EngSec).dropna()
    feature_to_be_updated.attributes['GPS_Time'] = matching_row['GPS_Time'].iloc[0]
    feature_to_be_updated.attributes['Visibility'] = int(matching_row['Visibility'])
    feature_to_be_updated.attributes['Condition'] = str(matching_row['Condition'].iloc[0])
    update_result = flayer.edit_features(updates=[feature_to_be_updated])
    update_result
elapsed_time = time.time() - start_time
totaltime = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("Total processing time: "+ totaltime)
I am currently working on a REST call using the Flask framework, and I have run into some errors along the way that I am unable to figure out (still trying, though). The error is shown below:
[2016-09-20 18:53:26,486] ERROR in app: Exception on /Recommend [GET]
Traceback (most recent call last):
File "/anaconda/lib/python2.7/site-packages/flask/app.py", line 1988, in wsgi_app
response = self.full_dispatch_request()
File "/anaconda/lib/python2.7/site-packages/flask/app.py", line 1641, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/anaconda/lib/python2.7/site-packages/flask_cors/extension.py", line 161, in wrapped_function
return cors_after_request(app.make_response(f(*args, **kwargs)))
File "/anaconda/lib/python2.7/site-packages/flask_api/app.py", line 97, in handle_user_exception
for typecheck, handler in chain(blueprint_handlers.items(), app_handlers.items()):
AttributeError: 'tuple' object has no attribute 'items'
127.0.0.1 - - [20/Sep/2016 18:53:26] "GET /Recommend HTTP/1.1" 500 -
Here is the code that I have built:
from flask import request, jsonify
from flask_api import FlaskAPI
from flask_cors import CORS
from sets import Set
from collections import defaultdict
import itertools
import copy
app = FlaskAPI(__name__)
CORS(app)
content = None
class Apriori:
    def __init__(self):
        self.no_of_transactions = None
        self.min_support = 0.5
        self.min_confidence = 0.75
        self.transactions = {}
        self.set_of_items = set()
        self.frequencies = {}
        self.frequent_itemsets_of_order_n = {}
        self.association_rules = {}

    def createPowerSet(self,s):
        powerset = set()
        for i in xrange(2**len(s)):
            subset = tuple([x for j,x in enumerate(s) if (i >> j) & 1])
            if len(subset) == 0:
                pass
            elif len(subset) == 1:
                powerset.add(subset[0])
            else:
                powerset.add(subset)
        return powerset

    def createFrequentItemSets(self,set_of_items,len):
        frequent_itemsets = set(itertools.combinations(set_of_items, len))
        for i in list(frequent_itemsets):
            tempset = set(i)
            self.frequencies[i] = 0
            for k, v in self.transactions.iteritems():
                if tempset.issubset(set(v)):
                    self.frequencies[i] += 1
            if float(self.frequencies[i])/self.no_of_transactions < self.min_support:
                frequent_itemsets.discard(i)
        return frequent_itemsets

    def mineAssociationRules(self,frequent_itemset):
        s = set(frequent_itemset)
        subs = list(self.createPowerSet(s))
        for each in subs:
            if sorted(tuple(set(each))) == sorted(tuple(s)):
                continue
            if len(set(each))==1:
                antecedent = list(set(each))[0]
            elif len(set(each))>1:
                antecedent = tuple(set(each))
            if len(s.difference(set(each)))==1:
                consequent = list(s.difference(set(each)))[0]
            elif len(s.difference(set(each)))>1:
                consequent = tuple(s.difference(set(each)))
            AuC = tuple(s)
            if float(self.frequencies[AuC])/self.frequencies[antecedent] >= self.min_confidence:
                if antecedent in self.association_rules:
                    pass
                else:
                    if type(antecedent) is tuple:
                        antecedent = (",").join(antecedent)
                    if type(consequent) is tuple:
                        consequent = (",").join(consequent)
                    self.association_rules[antecedent] = consequent

    def implement(self,transactions):
        #for i in range(0,self.no_of_transactions):
        for i in range(0,len(transactions)):
            self.transactions["T"+str(i)] = defaultdict(list)
            self.transactions["T"+str(i)] = transactions[i].split(',')
            self.set_of_items = self.set_of_items.union(Set(self.transactions["T"+str(i)]))
        for i in list(self.set_of_items):
            self.frequencies[i] = 0
            for k, v in self.transactions.iteritems():
                if i in v:
                    self.frequencies[i] = self.frequencies[i] + 1
            if float(self.frequencies[i])/self.no_of_transactions < self.min_support:
                self.set_of_items.discard(i)
        self.frequent_itemsets_of_order_n[1] = self.set_of_items
        l = 1
        reps = copy.deepcopy(self.set_of_items)
        while True:
            l += 1
            result = self.createFrequentItemSets(self.set_of_items, l)
            if len(result) == 0:
                break
            self.frequent_itemsets_of_order_n[l] = result
            reps = copy.deepcopy(self.frequent_itemsets_of_order_n[l])
        l = l-1
        while l>2:
            for each in self.frequent_itemsets_of_order_n[l]:
                self.mineAssociationRules(each)
            l = l-1
@app.route('/Recommend')
def FindAssociations():
    transactions = ["A,C,D,F,G","A,B,C,D,F","C,D,E","A,D,F","A,C,D,E,F","B,C,D,E,F,G"]
    apr = Apriori()
    apr.implement(transactions)
    return jsonify(rules=apr.association_rules)

if __name__ == "__main__":
    app.run(port=5000)
I ran some sample code found on the web and built the above script based on those scripts; they worked well. The class I built is based on another Python program I had written earlier, which also worked well. Should I have imported the class from another script instead of defining it here?
There are a number of errors in your Apriori class which need to be fixed. There are a few attempts to divide by self.no_of_transactions (e.g. line 87), which is initialised to None and never changed. Dividing by None raises an exception:
>>> 1/None
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: unsupported operand type(s) for /: 'int' and 'NoneType'
This exception is then handled in Flask-API's handle_user_exception() method, which also appears to have a bug as shown in the exception it raises.
The way to fix it is to correct your code so that it does not divide by None.
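For example, a minimal sketch of that correction, assuming the count should simply be the number of transactions handed to implement():

    def implement(self,transactions):
        # set the divisor before any support calculations use it
        self.no_of_transactions = len(transactions)
        for i in range(0,len(transactions)):
            self.transactions["T"+str(i)] = transactions[i].split(',')
            self.set_of_items = self.set_of_items.union(Set(self.transactions["T"+str(i)]))
        # ... rest of the method unchanged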