comparing monary with pymongo for accessing mongodb - python

I searched for ways to work with MongoDB in Python and the following came up:
https://bitbucket.org/djcbeach/monary/wiki/Home
According to this link, monary is supposed to outperform pymongo, but the comparisons there were for queries. We want to see the difference when writing to MongoDB (insert), so we wrote the following, based on the example code from the link:
from monary import Monary
from monary import monary_param as mp
import numpy as np
import time
NUM_BATCHES = 3500
BATCH_SIZE = 200
start = time.time()
types = ["float64"] * 5
fields = ["x1", "x2", "x3", "x4", "x5"]
global params
with Monary("127.0.0.1") as monary:
for i in xrange(NUM_BATCHES):
for l in xrange(BATCH_SIZE):
stuff = [ ]
for j in xrange(5):
record = dict(x1=random.uniform(0, 1),
x2=random.uniform(0, 2),
x3=random.uniform(0, 3),
x4=random.uniform(0, 4),
x5=random.uniform(0, 5)
)
stuff.append(record)
params = mp.MonaryParam.from_lists(np.array(stuff), fields)
monary.insert('mydb','collection',params)
end =time.time()
print 'Total time elapsed: %02d:%02d'% divmod((end - start), 60)
Why do we keep getting errors?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-58-9ad6ec8c81e5> in <module>()
25 stuff.append(record)
26 #print len(np.array(stuff.append(record)))
---> 27 params = mp.MonaryParam.from_lists(np.array(stuff), fields)
28 monary.insert('mydb','collection',params)
29 # with Monary("127.0.0.1") as monary:
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in from_lists(cls, data, fields, types)
77 raise ValueError(
78 "Data and fields must be of equal length.")
---> 79 return cls.from_groups(zip(data, fields))
80 else:
81 if not (len(data) == len(fields) == len(types)):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in from_groups(cls, groups)
93 - `groups`: List of items to be passed to MonaryParam.
94 """
---> 95 return list(map(lambda x: cls(x), groups))
96
97 def __len__(self):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in <lambda>(x)
93 - `groups`: List of items to be passed to MonaryParam.
94 """
---> 95 return list(map(lambda x: cls(x), groups))
96
97 def __len__(self):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in __init__(self, array, field, mtype)
39 if len(array) == 2:
40 array, field = array
---> 41 mtype = str(array.data.dtype)
42 else:
43 array, field, mtype = array
AttributeError: 'dict' object has no attribute 'data'

Related

How to use tabulate library?

I am trying to use tabulate with the zip_longest function. So I have it like this:
from __future__ import print_function
from tabulate import tabulate
from itertools import zip_longest
import itertools
import locale
import operator
import re
50 ="['INGBNL2A, VAT number: NL851703884B01 i\nTel, +31 (0}1 80 61 88 \n\nrut ard wegetables\n\x0c']"
fruit_words = ['Appels', 'Ananas', 'Peen Waspeen',
'Tomaten Cherry', 'Sinaasappels',
'Watermeloenen', 'Rettich', 'Peren', 'Peen', 'Mandarijnen', 'Meloenen', 'Grapefruit']
def total_amount_fruit_regex(format_=re.escape):
return r"(\d*(?:\.\d+)*)\s*~?=?\s*(" + '|'.join(
format_(word) for word in fruit_words) + ')'
def total_fruit_per_sort():
number_found = re.findall(total_amount_fruit_regex(), verdi50)
fruit_dict = {}
for n, f in number_found:
fruit_dict[f] = fruit_dict.get(f, 0) + int(n)
result = '\n'.join(f'{key}: {val}' for key, val in fruit_dict.items())
return result
def fruit_list(format_=re.escape):
return "|".join(format_(word) for word in fruit_words)
def findallfruit(regex):
return re.findall(regex, verdi50)
def verdi_total_number_fruit_regex():
return rf"(\d*(?:\.\d+)*)\s*\W+(?:{fruit_list()})"
def show_extracted_data_from_file():
regexes = [
verdi_total_number_fruit_regex(),
]
matches = [findallfruit(regex) for regex in regexes]
fruit_list = total_fruit_per_sort().split("\n")
return "\n".join(" \t ".join(items) for items in zip_longest(tabulate(*matches, fruit_list, headers=['header','header2'], fillvalue='', )))
print(show_extracted_data_from_file())
But then I get this error:
TypeError at /controlepunt140
tabulate() got multiple values for argument 'headers'
So how can I fix this?
If I remove the tabulate call, the output looks like this:
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
The expected output, with headers, is:
header1 header2
------- -------
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
That is, formatted the way tabulate normally renders a table.
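You should pass a single table to the tabulate() function. When you pass multiple lists positionally, the second one is bound to the headers parameter, which then collides with the headers= keyword and results in the TypeError: tabulate() got multiple values for argument 'headers' you are seeing.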
Updating your return statement -
def show_extracted_data_from_file():
    """Return the extracted numbers and per-fruit totals as a two-column table.

    The first column holds the numbers matched by
    ``verdi_total_number_fruit_regex()``; the second holds the
    ``"<fruit>: <total>"`` lines produced by ``total_fruit_per_sort()``.

    Returns:
        The table text produced by ``tabulate``, with the headers
        ``header1`` and ``header2``.
    """
    regexes = [
        verdi_total_number_fruit_regex(),
    ]
    matches = [findallfruit(regex) for regex in regexes]
    # Named fruit_totals (not fruit_list) so the local does not shadow the
    # module-level fruit_list() helper.
    fruit_totals = total_fruit_per_sort().split("\n")
    # tabulate() takes ONE table as its first positional argument, so the
    # columns are zipped into rows first. zip_longest pads the shorter column
    # with None (presumably rendered by tabulate as an empty cell -- confirm
    # against tabulate's `missingval` default).
    rows = zip_longest(*matches, fruit_totals)
    return tabulate(rows, headers=['header1', 'header2'])
Output:
header1 header2
--------- ------------------
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61

STANZA/ RuntimeError: Integer division of tensors using div or / is no longer supported

I want to use stanza for tokenizing, POS tagging and parsing some text I have, but it keeps giving me this error. I've tried changing the way I call it, but nothing changes. Any ideas?
My code (here I iterate through a list of lists of text and apply stanza to each one):
t = time()
data_stanza = []
for text in data:
stz = apply_stanza(text[0])
data_stanza.append(stz)
print('Time to run: {} mins'.format(round((time() - t) / 60, 2)))
This is the function I use to apply stanza to each text:
nlp = stanza.Pipeline('pt')
def apply_stanza(text):
doc = nlp(text)
All = []
for sent in doc.sentences:
for word in sent.words:
All.append((word.id,word.text,word.lemma,word.upos,word.feats,word.head,word.deprel))
return All
The error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-17-7ac303eec8e8> in <module>
3 data_staza = []
4 for text in data:
----> 5 stz = apply_stanza(text[0])
6 data_stanza.append(stz)
7
<ipython-input-16-364c3ac30f32> in apply_stanza(text)
2
3 def apply_stanza(text):
----> 4 doc = nlp(text)
5 All = []
6 for sent in doc.sentences:
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in __call__(self, doc)
174 assert any([isinstance(doc, str), isinstance(doc, list),
175 isinstance(doc, Document)]), 'input should be either str, list or Document'
--> 176 doc = self.process(doc)
177 return doc
178
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in process(self, doc)
168 for processor_name in PIPELINE_NAMES:
169 if self.processors.get(processor_name):
--> 170 doc = self.processors[processor_name].process(doc)
171 return doc
172
~\anaconda3\lib\site-packages\stanza\pipeline\mwt_processor.py in process(self, document)
31 preds = []
32 for i, b in enumerate(batch):
---> 33 preds += self.trainer.predict(b)
34
35 if self.config.get('ensemble_dict', False):
~\anaconda3\lib\site-packages\stanza\models\mwt\trainer.py in predict(self, batch, unsort)
77 self.model.eval()
78 batch_size = src.size(0)
---> 79 preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
80 pred_seqs = [self.vocab.unmap(ids) for ids in preds] # unmap to tokens
81 pred_seqs = utils.prune_decoded_seqs(pred_seqs)
~\anaconda3\lib\site-packages\stanza\models\common\seq2seq_model.py in predict(self, src, src_mask, pos, beam_size)
259 done = []
260 for b in range(batch_size):
--> 261 is_done = beam[b].advance(log_probs.data[b])
262 if is_done:
263 done += [b]
~\anaconda3\lib\site-packages\stanza\models\common\beam.py in advance(self, wordLk, copy_indices)
82 # bestScoresId is flattened beam x word array, so calculate which
83 # word and beam each score came from
---> 84 prevK = bestScoresId / numWords
85 self.prevKs.append(prevK)
86 self.nextYs.append(bestScoresId - prevK * numWords)
RuntimeError: Integer division of tensors using div or / is no longer supported, and in a future release div will perform
true division as in Python 3. Use true_divide or floor_divide (// in Python) instead.
Update: it turned out, after all, that it was an error in the mwt module of the stanza pipeline, so I just specified not to use it.
Use // for division instead of /.
Try to edit your code as follows:
print('Time to run: {} mins'.format(round((time() - t) // 60, 2)))
Floor division (//) rounds the result down to the nearest integer (the largest integer not greater than the exact quotient).
Alternatively, use torch.true_divide(Dividend, Divisor) or numpy.true_divide(Dividend, Divisor) instead.
For example: 3/4 = torch.true_divide(3, 4)
https://pytorch.org/docs/stable/generated/torch.true_divide.html
https://numpy.org/doc/stable/reference/generated/numpy.true_divide.html

JupyterLab does not adopt changes in a function from an imported script

I'm using JupyterLab to run the script that analyses my data. In this script, called 'analysis.ipynb', I'm using a function from the script 'def_readfb_dwd.py'. This function contained a small error (a wrong parameter, 'conf_flag_fb_aeolus', was included), which I corrected. But the correction has not been picked up: when I run the analysis script again, I get the same error message:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-78-39237ed9c16d> in <module>
----> 1 data_dwd = read_multiple_files(file_list,date_list,time_list,channel)
2 print(channel + ': ' + data_str, + ' ---> data loaded')
~/PhD/code/functions/def_readfb_dwd.py in read_multiple_files(file_list, date_list, time_list, retrtype)
106
107 def read_multiple_files(file_list,date_list,time_list,retrtype):
--> 108 combined_data = readin_fb_files(file_list[0],date_list[0],time_list[0],retrtype)
109 if len(file_list)>1:
110 for f in range(1,len(file_list)):
~/PhD/code/functions/def_readfb_dwd.py in readin_fb_files(file, d, anatime, retrtype)
80 ### filter data for time and status-active
81 filter_retrtype = np.where((data['retrtype'] == retrtype))[0]
---> 82 filter_flags = np.where((data['r_state'] == 1) & (data['r_check'] == 32))[0]
83 filt_ind = np.intersect1d(filter_retrtype,filter_flags)
84 data['r_state']=data['r_state'][filt_ind]
NameError: name 'conf_flag_fb_aeolus' is not defined
What can I do so that changes in the scripts I import are picked up?

seqmining: how to calculate frequency of a sequence on python

I'm trying to use pymining on Python to generate frequent sequences from my dataset. My code below appears to be working well:
from pymining import seqmining
seqs = ( 'caabc', 'abcb', 'cabc', 'abbca')
freq_seqs = seqmining.freq_seq_enum(seqs, 2)
sorted(freq_seqs)
However, when i want to use it with my dataset:
import numpy as np
import pandas as pd
from pymining import seqmining
def importdata():
filename = pd.read_csv('C:/Users/asus/Desktop/memoire/sequences-code.csv', sep= ';', header = None)
data=importdata()
seqs = data
freq_seqs = seqmining.freq_seq_enum(seqs, 2)
sorted(freq_seqs)
I get this error:
TypeError: 'NoneType' object is not iterable
This is the full traceback:
TypeError Traceback (most recent call last)
<ipython-input-4-19e2af14465a> in <module>()
8 data=importdata()
9 seqs = data
---> 10 freq_seqs = seqmining.freq_seq_enum(seqs, 2)
11 sorted(freq_seqs)
12
~\Anaconda3\lib\site-packages\pymining\seqmining.py in freq_seq_enum(sequences, min_support)
9 '''
10 freq_seqs = set()
---> 11 _freq_seq(sequences, tuple(), 0, min_support, freq_seqs)
12 return freq_seqs
13
~\Anaconda3\lib\site-packages\pymining\seqmining.py in _freq_seq(sdb, prefix, prefix_support, min_support, freq_seqs)
16 if prefix:
17 freq_seqs.add((prefix, prefix_support))
---> 18 locally_frequents = _local_freq_items(sdb, prefix, min_support)
19 if not locally_frequents:
20 return
~\Anaconda3\lib\site-packages\pymining\seqmining.py in _local_freq_items(sdb, prefix, min_support)
28 items = defaultdict(int)
29 freq_items = []
---> 30 for entry in sdb:
31 visited = set()
32 for element in entry:
TypeError: 'NoneType' object is not iterable
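Your importdata function reads the CSV but never returns the result, so data is None, which is why you get "'NoneType' object is not iterable". The simplest change you can make to your code is to get rid of importdata, which is just a wrapper around pd.read_csv. Try: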
filename = 'C:/Users/asus/Desktop/memoire/sequences-code.csv'
data = pd.read_csv(filename, sep=';', header=None)
Let me know if that helps.

Getting error: AttributeError: class <CLASS NAME> has no attribute '<METHOD NAME>'

Here is the code:
1 #!/usr/bin/env python
2
3 import re, os, sys, jira, subprocess
4
5 class Check_jira:
6
7 def verify_commit_text(self, tags):
8 for line in tags:
9 if re.match('^NO-TIK',line):
10 return True
11 elif re.match('^NO-REVIEW', line):
12 return True
13 elif re.match(r'[a-zA-Z]+-\d+', line):
14 # Validate the JIRA ID
15 m = re.search("([a-zA-Z]+-\d+)",line)
16 if m:
17 my_args = m.group(1)
18 result = Check_jira.CheckForJiraIssueRecord(my_args)
19 if result == False:
20 util.warn("%s does not exist"%my_args)
21 else:
22 return True
23 return True
24 else:
25 return False
26 if __name__ == '__main__':
27 p = Check_jira()
28 commit_text_verified = p.verify_commit_text(os.popen('hg tip --template "{desc}"'))
29
30 if (commit_text_verified):
31 sys.exit(0)
32 else:
33 print >> sys.stderr, ('[obey the rules!]')
34 sys.exit(1);
35 def CheckForJiraIssueRecord(object):
36
37 sys.stdout = os.devnull
38 sys.stderr = os.devnull
39
40
41 try:
42 com = jira.Commands()
43 logger = jira.setupLogging()
44 jira_env = {'home':os.environ['HOME']}
45 command_cat= "cat"
46 command_logout= "logout"
47 #my_args = ["QA-656"]
48 server = "http://jira.myserver.com:8080/rpc/soap/jirasoapservice-v2?wsdl"
49 except Exception, e:
50 sys.exit('config error')
51
52 class Options:
53 pass
54 options = Options()
55
56 options.user = 'user'
57 options.password = 'password'
58
59 try:
60
61 jira.soap = jira.Client(server)
62 jira.start_login(options, jira_env, command_cat, com, logger)
63 issue = com.run(command_cat, logger, jira_env, my_args)
64 except Exception, e:
65 print sys.exit('data error')
So maybe:
1. Should if __name__ == '__main__': be at the bottom?
2. So, I have 2 classes (Check_jira) and (Options).
3. Check_jira has 2 functions, verify_commit_text() and CheckForJiraIssueRecord().
4. I pass object as an argument to CheckForJiraIssueRecord since I am passing my_args to it where it is used.
5. I am not sure how to call one function from another function in the same class.
6. The error I am getting is:
Traceback (most recent call last):
File "/home/qa/hook-test/.hg/check_jira.py", line 31, in
commit_text_verified = p.verify_commit_text(os.popen('hg tip --template "{desc}"'))
File "/home/qa/hook-test/.hg/check_jira.py", line 21, in verify_commit_text
result = Check_jira.CheckForJiraIssueRecord(my_args)
AttributeError: class Check_jira has no attribute 'CheckForJiraIssueRecord'
transaction abort!
rollback completed
abort: pretxncommit.jira hook exited with status 1
class Check_jira ends on line 25 and has only one method. Then you have an if block, and CheckForJiraIssueRecord is just a function defined in this block (that is, the function is only defined if __name__ == '__main__').
Just move the if block to the end of the file, after the whole class definition, so that CheckForJiraIssueRecord becomes part of the class.

Categories

Resources