error when decoding ANSI character "â€" - python

I have a program in Python 3 that reads and compares files (that have the same name) in two folders, "gold" and "prediction",
but it generates this error. My files are in UTF-8 format, so the bytes that trigger the error are 0xE2 0x80 (shown as "â€" in ANSI):
Traceback (most recent call last):
File "C:\scienceie2017_train\test.py", line 215, in <module>
calculateMeasures(folder_gold, folder_pred, remove_anno)
File "C:\scienceie2017_train\test.py", line 34, in calculateMeasures
res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
File "C:\scienceie2017_train\test.py", line 132, in normaliseAnnotations
for l in file_anno:
File "C:\Users\chedi\Anaconda3\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 915-916: invalid continuation byte
The code is:
#!/usr/bin/python
# by Matthew Peters, who spotted that sklearn does macro averaging,
# not micro averaging, correctly and changed it
import os
from sklearn.metrics import precision_recall_fscore_support
import sys


def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set to "rel", relations will be ignored. Use this setting to only evaluate
        keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase
        boundary recognition is evaluated. Note that for the latter, false positive
    :return:
    '''
    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []
    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r", encoding="utf8")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r", encoding="utf8")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []
        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)
        spans_all = set(spans_gold + spans_pred)
        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if target not in targets:
                    targets.append(target)
            else:
                res_all_gold.append("NONE")
            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                res_all_pred.append("NONE")

    # y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average=None)
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }

    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics


def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line

    columns = ['precision', 'recall', 'f1-score', 'support']
    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')

    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')
    print(''.join(report))


def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []
    for l in file_anno:
        print(l)
        print(l.strip('\n'))
        r_g = l.strip('\n').split("\t")
        print(r_g)
        print(len(r_g))
        r_g_offs = r_g[1].split()
        print(r_g_offs)
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue
        res_full_anno.append(l.strip())
        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")
            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)

    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index (1, 2)
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []
    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])
    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno


if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"
    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]

    calculateMeasures(folder_gold, folder_pred, remove_anno)
Example of a prediction file:
T1 Task 4 20 particular phase
T2 Task 4 26 particular phase field
T3 Task 15 26 phase field
T4 Task 15 32 phase field model
T5 Task 21 32 field model
T6 Task 93 118 dimensional thermal phase
T7 Task 105 118 thermal phase
T8 Task 105 124 thermal phase field
T9 Task 15 26 phase field
T10 Task 15 32 phase field model
T11 Task 21 32 field model
T12 Task 146 179 dimensional thermal-solutal phase
T13 Task 158 179 thermal-solutal phase
T14 Task 158 185 thermal-solutal phase field
T15 Task 15 26 phase field
T16 Task 15 32 phase field model
T17 Task 21 32 field model
T18 Task 219 235 physical problem
T19 Task 300 330 natural relaxational phenomena
T20 Task 308 330 relaxational phenomena
T21 Task 340 354 resulting PDEs
T22 Task 362 374 Allen–Cahn
T23 Task 383 403 Carn–Hilliard type
T24 Task 445 461 time derivatives
T25 Task 509 532 variational derivatives
T26 Task 541 554 functional â€
T27 Task 570 581 free energy
T28 Task 570 592 free energy functional
T29 Task 575 592 energy functional
T30 Task 570 581 free energy
T31 Task 702 717 domain boundary
T32 Task 780 797 difficult aspects
T33 Task 817 836 relaxational aspect
T34 Task 874 898 stable numerical schemes
T35 Task 881 898 numerical schemes
Example of a gold file:
T1 Material 2 20 fluctuating vacuum
T2 Process 45 59 quantum fields
T3 Task 45 59 quantum fields
T4 Process 74 92 free Maxwell field
T5 Process 135 151 Fermionic fields
T6 Process 195 222 undergo vacuum fluctuations
T7 Process 257 272 Casimir effects
T8 Task 396 411 nuclear physics
T9 Task 434 464 “MIT bag model” of the nucleon
T10 Task 518 577 a collection of fermionic fields describing confined quarks
T11 Process 732 804 the bag boundary condition modifies the vacuum fluctuations of the field
T12 Task 983 998 nuclear physics
T13 Material 1063 1080 bag-model nucleon
T14 Material 507 514 nucleon
T15 Task 843 856 Casimir force
T16 Process 289 300 such fields

"–".encode("cp1256").decode("utf8") = –, an en dash.
The files you are opening appear to be encoded in UTF-8, and you are not specifying the encoding that should be used when open()ing them (just add encoding="utf8" to the parameters).
Otherwise Python will use the operating system's default character encoding, and you appear to be using Windows, where that is always something other than UTF-8. Take a look at
import locale
locale.getpreferredencoding()
to find out what encoding Python will use by default when reading and writing files.
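To see concretely what is happening, here is a minimal sketch (the .ann path is hypothetical): the two bytes from the traceback are the start of the UTF-8 encoding of an en dash, which an ANSI codec renders as the "â€" mojibake from the question.

import locale

# The UTF-8 bytes of an en dash are exactly the bytes from the traceback.
raw = "–".encode("utf8")
print(raw)                    # b'\xe2\x80\x93'
print(raw.decode("cp1256"))   # mojibake starting with "â€"
print(raw.decode("utf8"))     # "–"

print(locale.getpreferredencoding())  # e.g. "cp1252" on many Windows systems

# Opening with an explicit encoding avoids relying on the OS default:
with open("data/dev/example.ann", "r", encoding="utf8") as f:  # hypothetical path
    for line in f:
        print(line.rstrip("\n"))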

Related

python-binance: How can I use the time parameter with this function?

I tried to use this function but I got an error.
I think I need to change the date format of the time parameters.
from datetime import datetime, timedelta

now = datetime.now()
past = now - timedelta(days=2)
past = str(past)
bars = client.get_historical_klines("BTCUSDT", "1m", start_str=past, end_str=None, limit=1000)
But I got an error.
When I delete start_str and end_str, it works.
How can I handle the date string for this function?
Could you help me? (An example would be best!)
---------ERROR------------
TypeError Traceback (most recent call last)
Input In [46], in <cell line: 1>()
----> 1 bars = client.get_historical_klines(symbol="BTCUSDT", interval="1m",
2 start_str=past, end_str=None, limit=1000)
File ~/opt/anaconda3/lib/python3.9/site-packages/binance/client.py:934, in Client.get_historical_klines(self, symbol, interval, start_str, end_str, limit, klines_type)
914 def get_historical_klines(self, symbol, interval, start_str=None, end_str=None, limit=1000,
915 klines_type: HistoricalKlinesType = HistoricalKlinesType.SPOT):
916 """Get Historical Klines from Binance
917
918 :param symbol: Name of symbol pair e.g BNBBTC
(...)
932
933 """
--> 934 return self._historical_klines(
935 symbol, interval, start_str=start_str, end_str=end_str, limit=limit, klines_type=klines_type
936 )
File ~/opt/anaconda3/lib/python3.9/site-packages/binance/client.py:969, in Client._historical_klines(self, symbol, interval, start_str, end_str, limit, klines_type)
966 timeframe = interval_to_milliseconds(interval)
968 # if a start time was passed convert it
--> 969 start_ts = convert_ts_str(start_str)
971 # establish first available start timestamp
972 if start_ts is not None:
File ~/opt/anaconda3/lib/python3.9/site-packages/binance/helpers.py:76, in convert_ts_str(ts_str)
74 if type(ts_str) == int:
75 return ts_str
---> 76 return date_to_milliseconds(ts_str)
File ~/opt/anaconda3/lib/python3.9/site-packages/binance/helpers.py:24, in date_to_milliseconds(date_str)
22 epoch: datetime = datetime.utcfromtimestamp(0).replace(tzinfo=pytz.utc)
23 # parse our date string
---> 24 d: Optional[datetime] = dateparser.parse(date_str, settings={'TIMEZONE': "UTC"})
25 if not d:
26 raise UnknownDateFormat(date_str)
File ~/opt/anaconda3/lib/python3.9/site-packages/dateparser/conf.py:92, in apply_settings.<locals>.wrapper(*args, **kwargs)
89 if not isinstance(kwargs['settings'], Settings):
90 raise TypeError("settings can only be either dict or instance of Settings class")
---> 92 return f(*args, **kwargs)
File ~/opt/anaconda3/lib/python3.9/site-packages/dateparser/__init__.py:61, in parse(date_string, date_formats, languages, locales, region, settings, detect_languages_function)
57 if languages or locales or region or detect_languages_function or not settings._default:
58 parser = DateDataParser(languages=languages, locales=locales,
59 region=region, settings=settings, detect_languages_function=detect_languages_function)
---> 61 data = parser.get_date_data(date_string, date_formats)
63 if data:
64 return data['date_obj']
File ~/opt/anaconda3/lib/python3.9/site-packages/dateparser/date.py:419, in DateDataParser.get_date_data(self, date_string, date_formats)
376 """
377 Parse string representing date and/or time in recognizable localized formats.
378 Supports parsing multiple languages and timezones.
(...)
416
417 """
418 if not isinstance(date_string, str):
--> 419 raise TypeError('Input type must be str')
421 res = parse_with_formats(date_string, date_formats or [], self._settings)
422 if res['date_obj']:
TypeError: Input type must be str
binance.client.Client.get_historical_klines() takes an int or str as the input value for start_str; see the documentation of this method:
def get_historical_klines(self, symbol, interval, start_str, end_str=None, limit=500,
                          klines_type: HistoricalKlinesType = HistoricalKlinesType.SPOT):
    """Get Historical Klines from Binance

    :param symbol: Name of symbol pair e.g BNBBTC
    :type symbol: str
    :param interval: Binance Kline interval
    :type interval: str
    :param start_str: Start date string in UTC format or timestamp in milliseconds
    :type start_str: str|int
    :param end_str: optional - end date string in UTC format or timestamp in milliseconds (default will fetch everything up to now)
    :type end_str: str|int
    :param limit: Default 500; max 1000.
    :type limit: int
    :param klines_type: Historical klines type: SPOT or FUTURES
    :type klines_type: HistoricalKlinesType
    :return: list of OHLCV values
    """
    return self._historical_klines(symbol, interval, start_str, end_str=end_str, limit=limit, klines_type=klines_type)
You're trying to pass a datetime.datetime object.
You can convert this object to a timestamp (ms) with:
import datetime as dt
now = dt.datetime.now(dt.timezone.utc)
past = now - dt.timedelta(days=2)
# Gives you a timestamp in ms
past_timestamp_ms = int(round(past.timestamp() * 1000, 0))
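The resulting timestamp can then be passed straight to the method (a sketch; client is assumed to be an already-constructed binance.client.Client):

# start_str accepts either a millisecond timestamp (int) or a date string.
bars = client.get_historical_klines("BTCUSDT", "1m",
                                    start_str=past_timestamp_ms,
                                    end_str=None, limit=1000)
# A plain date string that dateparser understands also works:
bars = client.get_historical_klines("BTCUSDT", "1m", start_str="2 days ago UTC")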

STANZA/ RuntimeError: Integer division of tensors using div or / is no longer supported

I want to use stanza for tokenizing, POS tagging and parsing some text I have, but it keeps giving me this error. I've tried changing the way I call it, but nothing happens. Any ideas?
My code (here I iterate through a list of lists of text and apply stanza to each one):
from time import time

t = time()
data_stanza = []
for text in data:
    stz = apply_stanza(text[0])
    data_stanza.append(stz)
print('Time to run: {} mins'.format(round((time() - t) / 60, 2)))
This is the function I use to apply stanza to each text:
import stanza

nlp = stanza.Pipeline('pt')

def apply_stanza(text):
    doc = nlp(text)
    All = []
    for sent in doc.sentences:
        for word in sent.words:
            All.append((word.id, word.text, word.lemma, word.upos,
                        word.feats, word.head, word.deprel))
    return All
The error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-17-7ac303eec8e8> in <module>
3 data_staza = []
4 for text in data:
----> 5 stz = apply_stanza(text[0])
6 data_stanza.append(stz)
7
<ipython-input-16-364c3ac30f32> in apply_stanza(text)
2
3 def apply_stanza(text):
----> 4 doc = nlp(text)
5 All = []
6 for sent in doc.sentences:
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in __call__(self, doc)
174 assert any([isinstance(doc, str), isinstance(doc, list),
175 isinstance(doc, Document)]), 'input should be either str, list or Document'
--> 176 doc = self.process(doc)
177 return doc
178
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in process(self, doc)
168 for processor_name in PIPELINE_NAMES:
169 if self.processors.get(processor_name):
--> 170 doc = self.processors[processor_name].process(doc)
171 return doc
172
~\anaconda3\lib\site-packages\stanza\pipeline\mwt_processor.py in process(self, document)
31 preds = []
32 for i, b in enumerate(batch):
---> 33 preds += self.trainer.predict(b)
34
35 if self.config.get('ensemble_dict', False):
~\anaconda3\lib\site-packages\stanza\models\mwt\trainer.py in predict(self, batch, unsort)
77 self.model.eval()
78 batch_size = src.size(0)
---> 79 preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
80 pred_seqs = [self.vocab.unmap(ids) for ids in preds] # unmap to tokens
81 pred_seqs = utils.prune_decoded_seqs(pred_seqs)
~\anaconda3\lib\site-packages\stanza\models\common\seq2seq_model.py in predict(self, src, src_mask, pos, beam_size)
259 done = []
260 for b in range(batch_size):
--> 261 is_done = beam[b].advance(log_probs.data[b])
262 if is_done:
263 done += [b]
~\anaconda3\lib\site-packages\stanza\models\common\beam.py in advance(self, wordLk, copy_indices)
82 # bestScoresId is flattened beam x word array, so calculate which
83 # word and beam each score came from
---> 84 prevK = bestScoresId / numWords
85 self.prevKs.append(prevK)
86 self.nextYs.append(bestScoresId - prevK * numWords)
RuntimeError: Integer division of tensors using div or / is no longer supported, and in a future release div will perform
true division as in Python 3. Use true_divide or floor_divide (// in Python) instead.
EDIT: It turned out after all that it was an error with the mwt module of the stanza pipeline, so I just specified not to use it.
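For reference, a sketch of what skipping that processor can look like; the exact processor list is an assumption, since the pipeline configuration was not shown:

import stanza

# Build the Portuguese pipeline without the mwt processor that raised the error.
nlp = stanza.Pipeline('pt', processors='tokenize,pos,lemma,depparse')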
Use // for division instead of /.
Try to edit your code as follows:
print('Time to run: {} mins'.format(round((time() - t) // 60, 2)))
Using floor division (//) rounds the result down to the nearest whole number.
Use torch.true_divide(dividend, divisor) or numpy.true_divide(dividend, divisor) instead.
For example: 3/4 == torch.true_divide(3, 4)
https://pytorch.org/docs/stable/generated/torch.true_divide.html
https://numpy.org/doc/stable/reference/generated/numpy.true_divide.html
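A small demonstration of the difference on tensors (a minimal sketch):

import torch

nums = torch.tensor([3])
dens = torch.tensor([4])
print(torch.true_divide(nums, dens))  # tensor([0.7500]), true division
print(nums // dens)                   # tensor([0]), floor division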

Displaying results from 3 different for loops in a table

I want to display all three lists side by side, with the names associated with the values, in a table format. I am doing it manually right now and it's taking a while for all 20 files I must do. Thank you for your help!
maxpreandpost = [Pre1, Pre2, Pre3, Pre4, Pre5, Pre6, Pre7, Pre8, Pre9, Pre10,
                 Post1, Post2, Post3, Post4, Post5, Post6, Post7, Post8, Post9, Post10]
for i in maxpreandpost:
    height = max(i.Z)
    print(height)
165.387
160.214
159.118
186.685
163.744
160.717
184.026
171.25099999999995
175.73
156.512
150.339
131.528
148.52100000000004
126.738
136.389
148.334
129.855
153.599
144.595
159.32299999999995
lenpreandpost = [Pre1, Pre2, Pre3, Pre4, Pre5, Pre6, Pre7, Pre8, Pre9, Pre10,
                 Post1, Post2, Post3, Post4, Post5, Post6, Post7, Post8, Post9, Post10]
for i in lenpreandpost:
    duration = len(i.Z)
    print(duration)
690
543
292
271
293
147
209
355
230
293
395
256
349
255
335
255
231
243
315
267
import math

dis = [Pre1, Pre2, Pre3, Pre4, Pre5, Pre6, Pre7, Pre8, Pre9, Pre10,
       Post1, Post2, Post3, Post4, Post5, Post6, Post7, Post8, Post9, Post10]
for i in dis:
    p1 = [max(i.X), max(i.Y)]
    p2 = [min(i.X), min(i.Y)]
    distance = math.sqrt(((p1[0]-p2[0])**2) + ((p1[1]-p2[1])**2))
    print(distance)
2219.0546989150585
2337.434842606099
1857.1832474809803
1450.0472277998394
1512.6539831504758
1058.5635689541748
1653.517987682021
1854.670452561212
1861.8190476064021
1775.672511965326
1872.275393720069
1814.9932559772114
1852.3299779009246
1875.2281201398403
1867.1599096301313
1708.250531327712
1793.8521780715407
1862.7949271803914
1872.843665022548
1800.2239125453254
Sure, append all values to output lists and then add them to a pandas dataframe:
import math
import pandas as pd

heightmax = []
maxpreandpost = [Pre1, Pre2, Pre3, Pre4, Pre5, Pre6, Pre7, Pre8, Pre9, Pre10,
                 Post1, Post2, Post3, Post4, Post5, Post6, Post7, Post8, Post9, Post10]
for i in maxpreandpost:
    height = max(i.Z)
    heightmax.append(height)

duration_pre_post = []
lenpreandpost = [Pre1, Pre2, Pre3, Pre4, Pre5, Pre6, Pre7, Pre8, Pre9, Pre10,
                 Post1, Post2, Post3, Post4, Post5, Post6, Post7, Post8, Post9, Post10]
for i in lenpreandpost:
    duration = len(i.Z)
    duration_pre_post.append(duration)

dis_p1_p2 = []
dis = [Pre1, Pre2, Pre3, Pre4, Pre5, Pre6, Pre7, Pre8, Pre9, Pre10,
       Post1, Post2, Post3, Post4, Post5, Post6, Post7, Post8, Post9, Post10]
for i in dis:
    p1 = [max(i.X), max(i.Y)]
    p2 = [min(i.X), min(i.Y)]
    distance = math.sqrt(((p1[0]-p2[0])**2) + ((p1[1]-p2[1])**2))
    dis_p1_p2.append(distance)

df = pd.DataFrame()  # initialize empty dataframe
# Store each list as a column in the df.
df['HeightMax'] = heightmax
df['DurationPrePost'] = duration_pre_post
df['DistanceP1P2'] = dis_p1_p2

# if you want to write this out to a tabular file:
df.to_csv('./Desktop/myDf.csv', sep='\t', index=False)
The output of this would be something like:
HeightMax DurationPrePost DistanceP1P2
0 165.387 690 2219.0546989150585
1 160.214 543 2337.434842606099
2 159.118 292 1857.1832474809803
3 186.685 271 1450.0472277998394
4 163.744 293 1512.6539831504758
... #extends to end of lists
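Since all three loops walk the same list of objects, the rows can also be built in a single pass; this is just a compact variant of the same idea (Pre1 through Post10 are the asker's objects with .X, .Y and .Z attributes):

import math
import pandas as pd

datasets = [Pre1, Pre2, Pre3, Pre4, Pre5, Pre6, Pre7, Pre8, Pre9, Pre10,
            Post1, Post2, Post3, Post4, Post5, Post6, Post7, Post8, Post9, Post10]
rows = []
for d in datasets:
    rows.append({
        'HeightMax': max(d.Z),
        'DurationPrePost': len(d.Z),
        # math.hypot(dx, dy) computes sqrt(dx**2 + dy**2) directly
        'DistanceP1P2': math.hypot(max(d.X) - min(d.X), max(d.Y) - min(d.Y)),
    })
df = pd.DataFrame(rows)
df.to_csv('./Desktop/myDf.csv', sep='\t', index=False)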

comparing monary with pymongo for accessing mongodb

I googled for ways of dealing with MongoDB in Python and the following came up:
https://bitbucket.org/djcbeach/monary/wiki/Home
Based on this link, Monary is supposed to outperform pymongo, but the comparisons in that link were only for queries. We want to see the difference when writing to MongoDB (insert), so we made the following based on the example code from the link:
from monary import Monary
from monary import monary_param as mp
import numpy as np
import random
import time

NUM_BATCHES = 3500
BATCH_SIZE = 200

start = time.time()
types = ["float64"] * 5
fields = ["x1", "x2", "x3", "x4", "x5"]
global params

with Monary("127.0.0.1") as monary:
    for i in xrange(NUM_BATCHES):
        for l in xrange(BATCH_SIZE):
            stuff = []
            for j in xrange(5):
                record = dict(x1=random.uniform(0, 1),
                              x2=random.uniform(0, 2),
                              x3=random.uniform(0, 3),
                              x4=random.uniform(0, 4),
                              x5=random.uniform(0, 5))
                stuff.append(record)
            params = mp.MonaryParam.from_lists(np.array(stuff), fields)
            monary.insert('mydb', 'collection', params)

end = time.time()
print 'Total time elapsed: %02d:%02d' % divmod((end - start), 60)
Why do we keep getting errors?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-58-9ad6ec8c81e5> in <module>()
25 stuff.append(record)
26 #print len(np.array(stuff.append(record)))
---> 27 params = mp.MonaryParam.from_lists(np.array(stuff), fields)
28 monary.insert('mydb','collection',params)
29 # with Monary("127.0.0.1") as monary:
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in from_lists(cls, data, fields, types)
77 raise ValueError(
78 "Data and fields must be of equal length.")
---> 79 return cls.from_groups(zip(data, fields))
80 else:
81 if not (len(data) == len(fields) == len(types)):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in from_groups(cls, groups)
93 - `groups`: List of items to be passed to MonaryParam.
94 """
---> 95 return list(map(lambda x: cls(x), groups))
96
97 def __len__(self):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in <lambda>(x)
93 - `groups`: List of items to be passed to MonaryParam.
94 """
---> 95 return list(map(lambda x: cls(x), groups))
96
97 def __len__(self):
/Users/kelvin/anaconda/envs/gl-env/lib/python2.7/site-packages/monary/monary_param.pyc in __init__(self, array, field, mtype)
39 if len(array) == 2:
40 array, field = array
---> 41 mtype = str(array.data.dtype)
42 else:
43 array, field, mtype = array
AttributeError: 'dict' object has no attribute 'data'
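The last frame is a hint: MonaryParam reads array.data.dtype, i.e. it appears to expect one numpy masked array per field, while np.array(stuff) produces an array of dicts. A sketch of building per-field columns instead, under that assumption (untested against monary itself):

import numpy as np
import numpy.ma as ma
from monary import monary_param as mp

# One masked float64 column per field, instead of an array of dicts.
cols = [ma.masked_array(np.random.uniform(0, k + 1, BATCH_SIZE), mask=False)
        for k in range(len(fields))]
params = mp.MonaryParam.from_lists(cols, fields)
monary.insert('mydb', 'collection', params)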

get best line when coordinates overlap

I have a dataset like this:
FBti0018875 2031 2045 - TTCCAGAAACTGTTG hsf 62 0.9763443383736672
FBti0018875 2038 2052 + TTCTGGAATGGCAAC hsf 61 0.96581136371138
FBti0018925 2628 2642 - GACTGGAACTTTTCA hsf 60 0.9532992561656318
FBti0018925 2828 2842 + AGCTGGAAGCTTCTT hsf 63 0.9657036377575696
FBti0018949 184 198 - TTCGAGAAACTTAAC hsf 61 0.965931072979605
FBti0018986 2036 2050 + TTCTAGCATATTCTT hsf 63 0.9943559469645551
FBti0018993 1207 1221 - TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0018996 2039 2053 + TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0019092 2985 2999 - TTCTAGCATATTCTT hsf 63 0.9943559469645409
FBti0019118 1257 1271 + TTCCAGAATCTTGGA hsf 60 0.9523907773798134
The first column is an identifier, and the second and third are coordinates. I only want to keep one line for each range of coordinates, keeping the best line if there is an overlap (the best being defined by the last column: higher value = better).
For example, for identifier FBti0018875 I would keep the first line because (a) there is overlap with the second line and (b) its last-column value is higher (0.97 > 0.96).
If there were no overlap between the first and second lines I would keep both. Sometimes I can have 5 or 6 lines for each identifier, so it's not as simple as comparing the current one with the previous.
So far I have this code that doesn't work.
def test(lista, listb):  # compare lists of coordinates
    a = 0
    b = 0
    found = False
    while a < len(lista) and b < len(listb):
        result = check(lista[a], listb[b])
        if result < 0:
            a += 1
            continue
        if result > 0:
            b += 1
            continue
        # we found overlapping intervals
        found = True
        return (found, a, lista[a], b, listb[b])
    return found

def check((astart, aend), (bstart, bend)):  # Python 2 tuple parameters
    if aend < bstart:
        return -1
    if bend < astart:
        return 1
    return 0
refine = open("tffm/tffm_te_hits95.txt", "r")
refined = open("tffm/tffm_te_unique_hits95.txt", "w")

current_TE = []
for hit in refine:
    info = hit.rstrip().split('\t')
    if len(current_TE) == 0 or info[0] == current_TE[0][0]:
        current_TE.append(info)
    elif info[0] != current_TE[0][0]:
        to_keep = []
        i = 0
        if len(current_TE) == 1:
            to_keep.append(0)
        else:
            for i in range(len(current_TE) - 1):
                if [current_TE[i][1], current_TE[i][2]] == [current_TE[i+1][1], current_TE[i+1][2]]:
                    if current_TE[i][7] < current_TE[i+1][7]:
                        to_keep.append(i+1)
                elif test([(current_TE[i][1], current_TE[i][2])], [(current_TE[i+1][1], current_TE[i+1][2])]) != 'False':
                    if current_TE[i][7] < current_TE[i+1][7]:
                        to_keep.append(i+1)
                        try:
                            to_keep.remove(i)
                        except:
                            pass
                    else:
                        to_keep.append(i)
                else:
                    to_keep.append(i)
                if i == len(current_TE) - 1:
                    to_keep.append(i+1)
        for item in set(to_keep):
            print current_TE[item]
        current_TE = []
The expected outcome in this case would be (only losing one FBti0018875 line):
FBti0018875 2031 2045 - TTCCAGAAACTGTTG hsf 62 0.9763443383736672
FBti0018925 2628 2642 - GACTGGAACTTTTCA hsf 60 0.9532992561656318
FBti0018925 2828 2842 + AGCTGGAAGCTTCTT hsf 63 0.9657036377575696
FBti0018949 184 198 - TTCGAGAAACTTAAC hsf 61 0.965931072979605
FBti0018986 2036 2050 + TTCTAGCATATTCTT hsf 63 0.9943559469645551
FBti0018993 1207 1221 - TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0018996 2039 2053 + TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0019092 2985 2999 - TTCTAGCATATTCTT hsf 63 0.9943559469645409
FBti0019118 1257 1271 + TTCCAGAATCTTGGA hsf 60 0.9523907773798134
I have tried (with the code above) to generate a list containing several lines with the same identifier and then parse it for the ones with overlapping coordinates, and if that was the case, select one according to the last column. It succeeds in checking the overlap, but in some versions of it I only retrieve a handful of lines, or I get:
Traceback (most recent call last):
File "<stdin>", line 29, in <module>
IndexError: list index out of range
Finally I solved it. There was a silly mistake with 'False' instead of False.
Here is the solution:
def test(lista, listb):
    a = 0
    b = 0
    found = False
    while a < len(lista) and b < len(listb):
        result = check(lista[a], listb[b])
        if result < 0:
            a += 1
            continue
        if result > 0:
            b += 1
            continue
        # we found overlapping intervals
        found = True
        return (found, a, lista[a], b, listb[b])
    return found

def check((astart, aend), (bstart, bend)):
    if aend < bstart:
        return -1
    if bend < astart:
        return 1
    return 0

def get_unique_sre(current_TE):
    to_keep = range(0, len(current_TE))
    for i in range(len(current_TE) - 1):
        if [current_TE[i][1], current_TE[i][2]] == [current_TE[i+1][1], current_TE[i+1][2]]:
            if current_TE[i][7] < current_TE[i+1][7]:
                try:
                    to_keep.remove(i)
                except:
                    pass
        elif test([(current_TE[i][1], current_TE[i][2])], [(current_TE[i+1][1], current_TE[i+1][2])]) != False:
            if current_TE[i][7] < current_TE[i+1][7]:
                try:
                    to_keep.remove(i)
                except:
                    pass
            else:
                to_keep.remove(i+1)
    final_TE = []
    for i in to_keep:
        final_TE.append(current_TE[i])
    return final_TE

refine = open("tffm/tffm_te_hits95.txt", "r")
refined = open("tffm/tffm_te_unique_hits95.txt", "w")

current_TE = []
for hit in refine:
    info = hit.rstrip().split('\t')
    if len(current_TE) == 0 or info[0] == current_TE[0][0]:
        current_TE.append(info)
    else:
        if len(current_TE) == 1:
            print>>refined, current_TE[0]
            current_TE = []
        else:
            final_TE = get_unique_sre(current_TE)
            for item in final_TE:
                print>>refined, item
            current_TE = []
refined.close()
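For comparison, a more compact Python 3 sketch of the same keep-the-best-on-overlap logic (same file paths and tab-separated columns as in the question):

def overlaps(a_start, a_end, b_start, b_end):
    # closed intervals overlap iff each one starts before the other ends
    return a_start <= b_end and b_start <= a_end

kept = []  # tuples of (identifier, start, end, score, original line)
with open("tffm/tffm_te_hits95.txt") as fh:
    for line in fh:
        f = line.rstrip("\n").split("\t")
        ident, start, end, score = f[0], int(f[1]), int(f[2]), float(f[7])
        for i, (kid, ks, ke, kscore, _) in enumerate(kept):
            if kid == ident and overlaps(start, end, ks, ke):
                if score > kscore:  # the new hit wins the overlap
                    kept[i] = (ident, start, end, score, line)
                break
        else:  # no overlap with anything kept so far
            kept.append((ident, start, end, score, line))

with open("tffm/tffm_te_unique_hits95.txt", "w") as out:
    out.writelines(entry[-1] for entry in kept)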
