I am trying to use tabulate with the zip_longest function. So I have it like this:
from __future__ import print_function
from tabulate import tabulate
from itertools import zip_longest
import itertools
import locale
import operator
import re
# Raw text that the regex helpers search; they refer to it as `verdi50`.
verdi50 = "['INGBNL2A, VAT number: NL851703884B01 i\nTel, +31 (0}1 80 61 88 \n\nrut ard wegetables\n\x0c']"
fruit_words = ['Appels', 'Ananas', 'Peen Waspeen',
'Tomaten Cherry', 'Sinaasappels',
'Watermeloenen', 'Rettich', 'Peren', 'Peen', 'Mandarijnen', 'Meloenen', 'Grapefruit']
def total_amount_fruit_regex(format_=re.escape):
    """Build the pattern that pairs an amount with one of the known fruit names.

    Args:
        format_: Callable applied to every entry of ``fruit_words`` before it
            is placed in the alternation. Defaults to ``re.escape``.

    Returns:
        A regex string with two capturing groups: the number (which may be
        empty, because the digits are optional) and the fruit name.
    """
    fruit_alternation = '|'.join(format_(word) for word in fruit_words)
    return r"(\d*(?:\.\d+)*)\s*~?=?\s*(" + fruit_alternation + ')'
def total_fruit_per_sort():
    """Sum the amounts found in ``verdi50`` for each fruit name.

    Returns:
        One ``"<fruit>: <total>"`` line per fruit, joined with newlines, in
        the order the fruits first appear in the text.

    Raises:
        ValueError: If a captured amount is not a plain integer (for example
            when it contains a dot).
    """
    totals = {}
    for amount, fruit in re.findall(total_amount_fruit_regex(), verdi50):
        # The number group is optional in the pattern, so a fruit name with
        # no amount in front of it yields ''. int('') would raise, so such
        # matches are skipped instead of aborting the whole report.
        if not amount:
            continue
        totals[fruit] = totals.get(fruit, 0) + int(amount)
    return '\n'.join(f'{fruit}: {total}' for fruit, total in totals.items())
def fruit_list(format_=re.escape):
    """Join the entries of ``fruit_words`` into a regex alternation.

    Args:
        format_: Callable applied to each word before joining. Defaults to
            ``re.escape`` so the names are matched literally.

    Returns:
        The formatted words separated by ``|``.
    """
    return "|".join(format_(word) for word in fruit_words)
def findallfruit(regex):
    """Return every match of ``regex`` in the module-level text ``verdi50``.

    Args:
        regex: Pattern (string or compiled) handed to ``re.findall``.

    Returns:
        The list produced by ``re.findall``: strings for a pattern with at
        most one capturing group, tuples otherwise.
    """
    return re.findall(regex, verdi50)
def verdi_total_number_fruit_regex():
    """Build the pattern that captures the number preceding a fruit name.

    Returns:
        A regex string with a single capturing group (the number). The fruit
        alternation from ``fruit_list()`` is non-capturing, and at least one
        non-word character must sit between the number and the name.
    """
    return rf"(\d*(?:\.\d+)*)\s*\W+(?:{fruit_list()})"
def show_extracted_data_from_file():
regexes = [
verdi_total_number_fruit_regex(),
]
matches = [findallfruit(regex) for regex in regexes]
fruit_list = total_fruit_per_sort().split("\n")
return "\n".join(" \t ".join(items) for items in zip_longest(tabulate(*matches, fruit_list, headers=['header','header2'], fillvalue='', )))
print(show_extracted_data_from_file())
But then I get this error:
TypeError at /controlepunt140
tabulate() got multiple values for argument 'headers'
So how can I fix this?
If you remove the tabulate call, the output looks like this:
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
So expected output is with headers:
header1 header2
------- -------
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
Like how it works in tabulate.
You should pass a single table to the tabulate() function. Passing multiple lists as separate positional arguments is what causes the TypeError: tabulate() got multiple values for argument 'headers' that you are seeing.
Updating your return statement -
def show_extracted_data_from_file():
    """Render the extracted numbers and the per-fruit totals as one table.

    Returns:
        The string produced by ``tabulate``: the regex matches in the first
        column and the ``total_fruit_per_sort()`` lines in the second, under
        the headers ``header1`` and ``header2``.
    """
    regexes = [
        verdi_total_number_fruit_regex(),
    ]
    matches = [findallfruit(regex) for regex in regexes]
    # Deliberately not called `fruit_list`: that name belongs to the
    # module-level helper and would be shadowed inside this function.
    fruit_totals = total_fruit_per_sort().split("\n")
    # tabulate() takes exactly one table, so the columns are zipped into rows
    # first; zip_longest pads the shorter column with None.
    rows = zip_longest(*matches, fruit_totals)
    return tabulate(rows, headers=['header1', 'header2'])
Output:
header1 header2
--------- ------------------
16 Watermeloenen: 466
360 Appels: 688
6 Sinaasappels: 803
75
9
688
22
80
160
320
160
61
Related
I want to use stanza for tokenizing, POS tagging and parsing some text I have, but it keeps giving me this error. I've tried changing the way I call it, but nothing changes. Any ideas?
My code (here I iterate through a list of lists of text and apply stanza to each one):
t = time()
data_stanza = []
for text in data:
stz = apply_stanza(text[0])
data_stanza.append(stz)
print('Time to run: {} mins'.format(round((time() - t) / 60, 2)))
This is the function I use to apply_stanza to each text:
nlp = stanza.Pipeline('pt')
def apply_stanza(text):
    """Run the module-level stanza pipeline ``nlp`` on ``text`` and flatten it.

    Args:
        text: The input passed straight to ``nlp``.

    Returns:
        A list with one tuple per word, across all sentences in document
        order: ``(id, text, lemma, upos, feats, head, deprel)``.
    """
    doc = nlp(text)
    return [
        (word.id, word.text, word.lemma, word.upos, word.feats, word.head, word.deprel)
        for sent in doc.sentences
        for word in sent.words
    ]
The error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-17-7ac303eec8e8> in <module>
3 data_staza = []
4 for text in data:
----> 5 stz = apply_stanza(text[0])
6 data_stanza.append(stz)
7
<ipython-input-16-364c3ac30f32> in apply_stanza(text)
2
3 def apply_stanza(text):
----> 4 doc = nlp(text)
5 All = []
6 for sent in doc.sentences:
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in __call__(self, doc)
174 assert any([isinstance(doc, str), isinstance(doc, list),
175 isinstance(doc, Document)]), 'input should be either str, list or Document'
--> 176 doc = self.process(doc)
177 return doc
178
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in process(self, doc)
168 for processor_name in PIPELINE_NAMES:
169 if self.processors.get(processor_name):
--> 170 doc = self.processors[processor_name].process(doc)
171 return doc
172
~\anaconda3\lib\site-packages\stanza\pipeline\mwt_processor.py in process(self, document)
31 preds = []
32 for i, b in enumerate(batch):
---> 33 preds += self.trainer.predict(b)
34
35 if self.config.get('ensemble_dict', False):
~\anaconda3\lib\site-packages\stanza\models\mwt\trainer.py in predict(self, batch, unsort)
77 self.model.eval()
78 batch_size = src.size(0)
---> 79 preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
80 pred_seqs = [self.vocab.unmap(ids) for ids in preds] # unmap to tokens
81 pred_seqs = utils.prune_decoded_seqs(pred_seqs)
~\anaconda3\lib\site-packages\stanza\models\common\seq2seq_model.py in predict(self, src, src_mask, pos, beam_size)
259 done = []
260 for b in range(batch_size):
--> 261 is_done = beam[b].advance(log_probs.data[b])
262 if is_done:
263 done += [b]
~\anaconda3\lib\site-packages\stanza\models\common\beam.py in advance(self, wordLk, copy_indices)
82 # bestScoresId is flattened beam x word array, so calculate which
83 # word and beam each score came from
---> 84 prevK = bestScoresId / numWords
85 self.prevKs.append(prevK)
86 self.nextYs.append(bestScoresId - prevK * numWords)
RuntimeError: Integer division of tensors using div or / is no longer supported, and in a future release div will perform
true division as in Python 3. Use true_divide or floor_divide (// in Python) instead.
ATT: It turned out, after all, that it was an error in the mwt module of the stanza pipeline, so I just specified not to use it.
Use // for division instead of /.
Try to edit your code as follows:
print('Time to run: {} mins'.format(round((time() - t) // 60, 2)))
Floor division (//) rounds the result down to the largest integer that is not greater than it.
Alternatively, use torch.true_divide(Dividend, Divisor) or numpy.true_divide(Dividend, Divisor) instead.
For example: 3/4 = torch.true_divide(3, 4)
https://pytorch.org/docs/stable/generated/torch.true_divide.html
https://numpy.org/doc/stable/reference/generated/numpy.true_divide.html
I'm trying to extract dates from txt files using datefinder.find_dates, which returns a generator object. Everything works fine until I try to convert the generator to a list, at which point I get the following error.
I have been looking around, but I can't figure out a solution, and I'm not sure I really understand the problem either.
import datefinder
import glob
path = "some_path/*.txt"
files = glob.glob(path)
dates_dict = {}
for name in files:
with open(name, encoding='utf8') as f:
dates_dict[name] = list(datefinder.find_dates(f.read()))
Returns :
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-53-a4b508b01fe8> in <module>()
1 for name in files:
2 with open(name, encoding='utf8') as f:
----> 3 dates_dict[name] = list(datefinder.find_dates(f.read()))
C:\ProgramData\Anaconda3\lib\site-packages\datefinder\__init__.py in
find_dates(self, text, source, index, strict)
29 ):
30
---> 31 as_dt = self.parse_date_string(date_string, captures)
32 if as_dt is None:
33 ## Dateutil couldn't make heads or tails of it
C:\ProgramData\Anaconda3\lib\site-packages\datefinder\__init__.py in
parse_date_string(self, date_string, captures)
99 # otherwise self._find_and_replace method might corrupt
them
100 try:
--> 101 as_dt = parser.parse(date_string, default=self.base_date)
102 except ValueError:
103 # replace tokens that are problematic for dateutil
C:\ProgramData\Anaconda3\lib\site-packages\dateutil\parser\_parser.py in
parse(timestr, parserinfo, **kwargs)
1354 return parser(parserinfo).parse(timestr, **kwargs)
1355 else:
-> 1356 return DEFAULTPARSER.parse(timestr, **kwargs)
1357
1358
C:\ProgramData\Anaconda3\lib\site-packages\dateutil\parser\_parser.py in
parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
651 raise ValueError("String does not contain a date:",
timestr)
652
--> 653 ret = self._build_naive(res, default)
654
655 if not ignoretz:
C:\ProgramData\Anaconda3\lib\site-packages\dateutil\parser\_parser.py in
_build_naive(self, res, default)
1222 cday = default.day if res.day is None else res.day
1223
-> 1224 if cday > monthrange(cyear, cmonth)[1]:
1225 repl['day'] = monthrange(cyear, cmonth)[1]
1226
C:\ProgramData\Anaconda3\lib\calendar.py in monthrange(year, month)
122 if not 1 <= month <= 12:
123 raise IllegalMonthError(month)
--> 124 day1 = weekday(year, month, 1)
125 ndays = mdays[month] + (month == February and isleap(year))
126 return day1, ndays
C:\ProgramData\Anaconda3\lib\calendar.py in weekday(year, month, day)
114 """Return weekday (0-6 ~ Mon-Sun) for year (1970-...), month(1- 12),
115 day (1-31)."""
--> 116 return datetime.date(year, month, day).weekday()
117
118
OverflowError: Python int too large to convert to C long
Can someone explain this clearly?
Thanks in advance
REEDIT : After taking into consideration the remarks that were made, I found a minimal, readable and verifiable example. The error occurs on :
import datefinder
generator = datefinder.find_dates("466990103060049")
for s in generator:
pass
This looks to be a bug in the library you are using. It is trying to parse the string as a year, but this year is too big to be handled by Python. The library that datefinder uses (dateutil) says that it raises an OverflowError in this instance, but datefinder ignores this possibility.
One quick and dirty hack just to get it working would be to do:
>>> datefinder.ValueError = ValueError, OverflowError
>>> list(datefinder.find_dates("2019/02/01 is a date and 466990103060049 is not"))
[datetime.datetime(2019, 2, 1, 0, 0)]
I want to display all three lists side by side with the names associated with the values in a table format. I am manually doing it right now and it's taking a while for all 20 files I must do. Thank you for your help!
maxpreandpost = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in maxpreandpost:
height = max(i.Z)
print (height)
165.387
160.214
159.118
186.685
163.744
160.717
184.026
171.25099999999995
175.73
156.512
150.339
131.528
148.52100000000004
126.738
136.389
148.334
129.855
153.599
144.595
159.32299999999995
lenpreandpost = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in lenpreandpost:
duration = len(i.Z)
print (duration)
690
543
292
271
293
147
209
355
230
293
395
256
349
255
335
255
231
243
315
267
dis = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in dis:
p1 = [max(i.X),max(i.Y)]
p2 = [min(i.X),min(i.Y)]
distance = math.sqrt(((p1[0]-p2[0])**2)+((p1[1]-p2[1])**2))
print (distance)
2219.0546989150585
2337.434842606099
1857.1832474809803
1450.0472277998394
1512.6539831504758
1058.5635689541748
1653.517987682021
1854.670452561212
1861.8190476064021
1775.672511965326
1872.275393720069
1814.9932559772114
1852.3299779009246
1875.2281201398403
1867.1599096301313
1708.250531327712
1793.8521780715407
1862.7949271803914
1872.843665022548
1800.2239125453254
Sure, append all values to output lists and then add them to a pandas dataframe:
import math

import pandas as pd

# Pre1..Pre10 and Post1..Post10 are the already-loaded recordings; each one
# exposes X, Y and Z sequences. The list is built once and shared by all
# three measurements so the columns are guaranteed to line up row by row.
recordings = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]

# Largest Z value of each recording.
heightmax = [max(rec.Z) for rec in recordings]

# Number of Z samples in each recording.
duration_pre_post = [len(rec.Z) for rec in recordings]

# Distance between the (max X, max Y) and (min X, min Y) corners of each
# recording, i.e. the diagonal of its X/Y bounding box.
dis_p1_p2 = [
    math.sqrt(((max(rec.X) - min(rec.X)) ** 2) + ((max(rec.Y) - min(rec.Y)) ** 2))
    for rec in recordings
]

# Store each list as a column in the df (dict order fixes the column order).
df = pd.DataFrame({
    'HeightMax': heightmax,
    'DurationPrePost': duration_pre_post,
    'DistanceP1P2': dis_p1_p2,
})

# If you want to write this out to a tabular (tab-separated) file:
df.to_csv('./Desktop/myDf.csv', sep='\t', index=False)
The output of this would be something like:
HeightMax DurationPrePost DistanceP1P2
0 165.387 690 2219.0546989150585
1 160.214 543 2337.434842606099
2 159.118 292 1857.1832474809803
3 186.685 271 1450.0472277998394
4 163.744 293 1512.6539831504758
... #extends to end of lists
Data file looks like this:
350 1.06k
351 1.10k
352 1.15k
Required output:
350 1060
351 1100
352 1150
import re
data = ['350 1.06k', '351 1.10k', '352 1.15k']
def covert_string(var):
    """Concatenate the numeric pieces of ``var`` and multiply by 1000.

    NOTE: spaces are removed before conversion, so both columns of a line are
    fused into a single number: ``'350 1.06k'`` becomes ``'3501.06'`` and the
    result is about ``3501060.0``, not ``'350 1060'``.

    Args:
        var: Text containing digits, e.g. ``'350 1.06k'``.

    Returns:
        The fused number times 1000, as a float.

    Raises:
        ValueError: If the collected characters do not form a valid float.
    """
    # Each match is one digit plus (optionally) the single character after it.
    pieces = re.findall(r'\d+?.?', var)
    return float(''.join(pieces).replace(' ', '')) * 1000
for value in data:
print covert_string(value)
## -- End pasted text --
3501060.0
3511100.0
3521150.0
I hope this solution works for you
This will help:
data = ['350 1.06k', '351 1.10k', '352 1.15k']
def convert(item):
    """Expand the ``k``-suffixed second field of ``item`` into a whole number.

    ``'350 1.06k'`` becomes ``'350 1060'``.

    Args:
        item: Whitespace-separated fields. The second field is a decimal
            number followed by a one-character suffix; the suffix is dropped
            and the number is multiplied by 1000.

    Returns:
        The fields re-joined with single spaces, with the second field
        replaced by the scaled value rounded to the nearest integer.

    Raises:
        IndexError: If ``item`` has fewer than two fields.
        ValueError: If the second field (minus its suffix) is not a number.
    """
    fields = item.split()
    scaled = float(fields[1][:-1]) * 1000
    # round() instead of slicing '.0' off str(scaled): the slice only works
    # when the product prints as '<int>.0' and otherwise leaves a dangling
    # '.' or a long fractional tail in the output.
    fields[1] = str(round(scaled))
    return ' '.join(fields)
res = [convert(i) for i in data]
print(res)
I googled for dealing with mongodb in python and the following came up
https://bitbucket.org/djcbeach/monary/wiki/Home
According to this link, monary is supposed to outperform pymongo, but the comparisons there were for queries. We want to see the difference when writing to mongodb (insert), so we wrote the following, based on the example code from the link:
from monary import Monary
from monary import monary_param as mp
import numpy as np
import time
NUM_BATCHES = 3500
BATCH_SIZE = 200
start = time.time()
types = ["float64"] * 5
fields = ["x1", "x2", "x3", "x4", "x5"]
global params
with Monary("127.0.0.1") as monary:
for i in xrange(NUM_BATCHES):
for l in xrange(BATCH_SIZE):
stuff = [ ]
for j in xrange(5):
record = dict(x1=random.uniform(0, 1),
x2=random.uniform(0, 2),
x3=random.uniform(0, 3),
x4=random.uniform(0, 4),
x5=random.uniform(0, 5)
)
stuff.append(record)
params = mp.MonaryParam.from_lists(np.array(stuff), fields)
monary.insert('mydb','collection',params)
end =time.time()
print 'Total time elapsed: %02d:%02d'% divmod((end - start), 60)
Why do we keep getting errors?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-58-9ad6ec8c81e5> in <module>()
25 stuff.append(record)
26 #print len(np.array(stuff.append(record)))
---> 27 params = mp.MonaryParam.from_lists(np.array(stuff), fields)
28 monary.insert('mydb','collection',params)
29 # with Monary("127.0.0.1") as monary:
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in from_lists(cls, data, fields, types)
77 raise ValueError(
78 "Data and fields must be of equal length.")
---> 79 return cls.from_groups(zip(data, fields))
80 else:
81 if not (len(data) == len(fields) == len(types)):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in from_groups(cls, groups)
93 - `groups`: List of items to be passed to MonaryParam.
94 """
---> 95 return list(map(lambda x: cls(x), groups))
96
97 def __len__(self):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in <lambda>(x)
93 - `groups`: List of items to be passed to MonaryParam.
94 """
---> 95 return list(map(lambda x: cls(x), groups))
96
97 def __len__(self):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in __init__(self, array, field, mtype)
39 if len(array) == 2:
40 array, field = array
---> 41 mtype = str(array.data.dtype)
42 else:
43 array, field, mtype = array
AttributeError: 'dict' object has no attribute 'data'