I'm writing software which does some analysis of the input and returns a result. Part of the requirements is that it generates zero or more warnings or errors and includes those with the result. I'm also writing unit tests which, in particular, have some contrived data to verify that the right warnings are emitted.
I need to be able to parse the warnings/errors and verify that the expected messages are correctly emitted. I figured I'd store the messages in a container and reference them with a message ID which is pretty similar to how I've done localization in the past.
errormessages.py right now looks pretty similar to:
from enum import IntEnum

# Canonical message texts, keyed below by MsgId.
NO_MESSAGE = ('')
HELLO = ('Hello, World')
GOODBYE = ('Goodbye')


class MsgId(IntEnum):
    """Stable identifiers for every diagnostic message."""
    NO_MESSAGE = 0
    HELLO = 1
    GOODBYE = 2


# Lookup table: message ID -> message text.
# BUG FIX: the enum is named MsgId, not MessageId; the original keys
# raised NameError at import time.
Msg = {
    MsgId.NO_MESSAGE: NO_MESSAGE,
    MsgId.HELLO: HELLO,
    MsgId.GOODBYE: GOODBYE,
}
So then the analysis can look similar to this:
from errormessages import Msg, MsgId
def analyse(_):
    """Toy analysis pass: returns the input unchanged plus collected messages."""
    collected = [Msg[MsgId.HELLO]]
    return _, collected
And in the unit tests I can do something similar to
from errormessages import Msg, MsgId
from my import analyse
def test_hello():
    """The analyser must emit the HELLO message for any input."""
    result, errors = analyse('toy')
    assert Msg[MsgId.HELLO] in errors
But some of the messages get formatted and I think that's going to play hell with parsing the messages for unit tests. I was thinking I'd add flavors of the messages; one for formatting and the other for parsing:
updated errormessages.py:
from enum import IntEnum
import re

# Each message exists in two flavours: a format template (for emitting)
# and a compiled regex (for parsing the emitted text back in unit tests).
FORMAT_NO_MESSAGE = ('')
FORMAT_HELLO = ('Hello, {}')
FORMAT_GOODBYE = ('Goodbye')
PARSE_NO_MESSAGE = re.compile(r'^$')
PARSE_HELLO = re.compile(r'^Hello, (.*)$')
PARSE_GOODBYE = re.compile(r'^Goodbye$')


class MsgId(IntEnum):
    """Stable identifiers for every diagnostic message."""
    NO_MESSAGE = 0
    HELLO = 1
    GOODBYE = 2


# Lookup table: message ID -> (format template, parse regex).
# BUG FIX: the enum is named MsgId, not MessageId; the original keys
# raised NameError at import time.
Msg = {
    MsgId.NO_MESSAGE: (FORMAT_NO_MESSAGE, PARSE_NO_MESSAGE),
    MsgId.HELLO: (FORMAT_HELLO, PARSE_HELLO),
    MsgId.GOODBYE: (FORMAT_GOODBYE, PARSE_GOODBYE),
}
So then the analysis can look like:
from errormessages import Msg, MsgId
def analyse(_):
    """Toy analysis pass: formats and collects the HELLO message."""
    template = Msg[MsgId.HELLO][0]
    return _, [template.format('World')]
And in the unit tests I can do:
from errormessages import Msg, MsgId
from my import analyse
import re
def test_hello():
    """Only the HELLO message should be parseable from the emitted errors."""
    _, errors = analyse('toy')
    # One match object (or None) per emitted error, tried against the HELLO
    # parse regex; every other message ID keeps an empty list.
    expected = {msg_id: [] for msg_id in MsgId}
    expected[MsgId.HELLO] = [Msg[MsgId.HELLO][1].match(msg) for msg in errors]
    for msg_id, matches in expected.items():
        if msg_id == MsgId.HELLO:
            assert matches
        else:
            assert not matches
I was wondering if there's perhaps a better / simpler way? In particular, the messages are effectively repeated twice; once for the formatter and once for the regular expression. Is there a way to use a single string for both formatting and regular expression capturing?
Assuming the messages are all stored as format string templates (e.g. "Hello", or "Hello, {}" or "Hello, {firstname} {surname}"), then you could generate the regexes directly from the templates:
import re
import random
import string
def format_string_to_regex(format_string: str) -> re.Pattern:
    """Convert a str.format template into a regex that captures its fields.

    Every replacement field (``{}``, ``{name}``, ...) becomes a ``(.*)``
    capture group; all literal text is escaped. Doubled braces ``{{``/``}}``
    are treated as the literal characters ``{``/``}``.
    """
    # Random placeholder that survives re.escape untouched, so field
    # positions can be located again after escaping. (A 24-letter random
    # string colliding with real template text is astronomically unlikely.)
    unique_string = ''.join(random.choices(string.ascii_letters, k=24))
    # Replace each replacement field; the (?!\}) lookahead stops the closing
    # half of a literal '}}' from being consumed as a field terminator.
    stripped_fields = re.sub(r"\{[^\{\}]*\}(?!\})", unique_string, format_string)
    pattern = re.escape(stripped_fields).replace(unique_string, "(.*)")
    # BUG FIX: the brace-unescaping literals must be raw strings; "\{" in a
    # plain string is an invalid escape sequence (DeprecationWarning today,
    # a SyntaxError in future Python versions).
    pattern = pattern.replace(r"\{\{", r"\{").replace(r"\}\}", r"\}")
    return re.compile(f"^{pattern}$")
def is_error_message(error: str, expected_message: MessageId) -> bool:
    """Returns whether the error plausibly matches the MessageId."""
    pattern = format_string_to_regex(Msg[expected_message])
    return pattern.match(error) is not None
Related
I have written a simple application using Flask. Its main objective is to implement CLD2 (language detector) using POST and GET methods. It works well for English, but for any other language, such as Urdu or Arabic, it gives invalid results.
Following is the corresponding script
# http://127.0.0.1:5000/cld2?text="Your input text string"
# OUTPUT ( It gives output as we done in CC)
#"585&URDU-99-1155"
# Python 2 Flask service exposing the CLD2 language detector.
from flask import Flask,abort,jsonify,request
from flask_restful import Resource, Api, reqparse
import cld2
# NOTE(review): 'BeautiflSoup' is a typo -- the bs4 class is 'BeautifulSoup';
# this import raises ImportError as written (and is unused below, so it
# could simply be removed).
from bs4 import BeautiflSoup
import sys
import urllib2, urllib  # urllib2 exists only on Python 2
import re
# Python 2-only hack to force a UTF-8 default encoding; reload() restores
# the setdefaultencoding attribute that site.py deletes at startup.
reload(sys)
sys.setdefaultencoding('utf8')
app = Flask(__name__)
api = Api(app)
class HelloWorld(Resource):
    """REST resource wrapping CLD2 language detection (GET and POST)."""

    def cld2_states(self, txt):
        # Builds "<bytesFound>&<lang>-<percent>-<score>" for each detected
        # language (matches the sample output in the usage comment above).
        txt = txt.encode("utf8")
        isReliable, textBytesFound, details = cld2.detect(txt)
        # NOTE(review): the accepted answer below suspects this str() call;
        # confirm textBytesFound's actual type before changing it.
        outstr = str(textBytesFound)
        for item in details: # Iterate 3 languages
            if item[0] != "Unknown":
                # Append "&<languageName>-<percent>-<score>".
                outstr += '&' + item[0] + '-' + str(item[2]) + '-' + str(int(item[3]))
        return outstr

    def get(self):
        # GET /cld2?text=... (the 'url' argument is parsed but never used).
        parser = reqparse.RequestParser()
        parser.add_argument('text', type=str)
        parser.add_argument('url', type=str)
        _dict = dict(parser.parse_args())
        if _dict["text"] is not None:
            value = _dict["text"]
            print type(value)  # Python 2 debug print
            return self.cld2_states(value)
        return None

    def post(self):
        # POST with a JSON body of the form {"content": ...}.
        data = request.get_json(force=True)
        # print data
        # NOTE(review): [data['content']] is a one-element list, so index [1]
        # raises IndexError; this almost certainly should be [0] or simply
        # data['content'].
        predict_request = [data['content']][1]
        out = self.cld2_states(predict_request)
        return jsonify(score=out)
api.add_resource(HelloWorld, '/cld2')

if __name__ == '__main__':
    # NOTE(review): the usage comment near the top says port 5000, but the
    # app actually binds 6161 on all interfaces.
    app.run(debug=True, port=6161, host='0.0.0.0')
If I give a query via the GET method, it gives correct results, but for the same query via the POST method, it returns just a number. However, if the text is in English, then POST also gives the correct result.
My client is a simple Java application that iterates over files and finds their language one by one.
The problem might be with this line:
outstr = str(textBytesFound)
Instead of using str to convert from bytes to str, use str.decode(), like this:
outstr = textBytesFound.decode("utf-8")
(obviously if your text is not encoded with UTF-8, you need to tell Python the correct encoding to use)
I'm trying to parse a json file from an api call.
I have found this code that fits my need and trying to adapt it to what I want:
import math, urllib2, json, re
def download():
    """Fetch FX quotes and build a graph of -log(rate) edges between currencies."""
    graph = {}
    response = urllib2.urlopen("http://fx.priceonomics.com/v1/rates/?q=1")
    rates = json.loads(response.read())
    # Keys look like "USD_EUR": a source/destination currency pair.
    pair_pattern = re.compile("([A-Z]{3})_([A-Z]{3})")
    for pair, rate in rates.items():
        matched = pair_pattern.match(pair)
        weight = -math.log(float(rate))
        src = matched.group(1).encode('ascii', 'ignore')
        dst = matched.group(2).encode('ascii', 'ignore')
        if src == dst:
            # Skip self-edges such as "USD_USD".
            continue
        graph.setdefault(src, {})[dst] = float(weight)
    return graph
And I've turned it into:
import math, urllib2, json, re
def download():
    """Attempted adaptation of the FX-rates grapher to the Bittrex API.

    NOTE(review): broken as written -- see the inline notes; the answer
    below iterates over all entries of jsrates['result'] instead of
    hard-coding index [0].
    """
    graph = {}
    page = urllib2.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries")
    jsrates = json.loads(page.read())
    # NOTE(review): ['result'][0]['MarketName'] appears to be a single
    # market-name string, so this loop walks over its characters...
    for pattern in jsrates['result'][0]['MarketName']:
        # ...and 'pattern' is then a one-character str, which has no
        # .match() method -- the call below raises AttributeError.
        for key in jsrates['result'][0]['Ask']:
            matches = pattern.match(key)
            conversion_rate = -math.log(float(jsrates[key]))
            from_rate = matches.group(1).encode('ascii','ignore')
            to_rate = matches.group(2).encode('ascii','ignore')
            if from_rate != to_rate:
                if from_rate not in graph:
                    graph[from_rate] = {}
                graph[from_rate][to_rate] = float(conversion_rate)
    return graph
Now the problem is that there are multiple levels in the JSON ("result" > 0, 1, 2, etc.).
json screenshot
for key in jsrates['result'][0]['Ask']:
I want the zero to be able to be any number, I don't know if thats clear.
So I could get all the ask price to match their marketname.
I have shortened the code so it doesn't make for too long a post.
Thanks
PS: sorry for the english, its not my native language.
You could loop through all of the result values that are returned, ignoring the meaningless numeric index:
for result in jsrates['result'].values():
ask = result.get('Ask')
if ask is not None:
# Do things with your ask...
I am new to Python, and I want your advice on something.
I have a script that runs one input value at a time, and I want it to be able to run a whole list of such values without me typing the values one at a time. I have a hunch that a "for loop" is needed for the main method listed below. The value is "gene_name", so effectively, i want to feed in a list of "gene_names" that the script can run through nicely.
Hope I phrased the question correctly, thanks! The chunk in question seems to be
def get_probes_from_genes(gene_names)
import json
import urllib2  # Python 2-only; this script predates urllib.request
import os
import pandas as pd

# Base endpoint of the Allen Brain Atlas RMA query API.
api_url = "http://api.brain-map.org/api/v2/data/query.json"
def get_probes_from_genes(gene_names):
    """Query the Allen Brain Atlas for DNA probes matching the given genes.

    Accepts a single gene name or a list of names; returns a dict mapping
    probe id -> probe name. Raises Exception when no probes are found.

    BUG FIXES over the pasted original (copy-paste damage): removed the
    stray '**', changed the 'api_query =' reassignments to '+=' so the
    query is actually accumulated, added the missing '+' between api_url
    and api_query, and repaired the garbled URL/variable name in the
    error message ('gene_name' was undefined here).
    """
    if not isinstance(gene_names, list):
        gene_names = [gene_names]
    # Quote each name (in case there are white spaces in gene names).
    gene_names = ["'%s'" % gene_name for gene_name in gene_names]
    api_query = "?criteria=model::Probe"
    api_query += ",rma::criteria,[probe_type$eq'DNA']"
    api_query += ",products[abbreviation$eq'HumanMA']"
    api_query += ",gene[acronym$eq%s]" % (','.join(gene_names))
    api_query += ",rma::options[only$eq'probes.id','name']"
    data = json.load(urllib2.urlopen(api_url + api_query))
    d = {probe['id']: probe['name'] for probe in data['msg']}
    if not d:
        raise Exception("Could not find any probes for %s gene. Check "
                        "http://help.brain-map.org/download/attachments/2818165/HBA_ISH_GeneList.pdf?version=1&modificationDate=1348783035873 "
                        "for list of available genes." % ', '.join(gene_names))
    return d
def get_expression_values_from_probe_ids(probe_ids):
    """Fetch expression values plus well/donor metadata for the given probes.

    Accepts a single probe id or a list; returns a tuple
    (expression_values, well_ids, well_coordinates, donor_names).

    BUG FIXES over the pasted original (copy-paste damage): removed the
    stray space inside the query string and added the missing '+' between
    api_url and api_query.
    """
    if not isinstance(probe_ids, list):
        probe_ids = [probe_ids]
    # Quote each id (in case there are white spaces).
    probe_ids = ["'%s'" % probe_id for probe_id in probe_ids]
    api_query = "?criteria=service::human_microarray_expression[probes$in%s]" % (','.join(probe_ids))
    data = json.load(urllib2.urlopen(api_url + api_query))
    expression_values = [[float(expression_value) for expression_value in data["msg"]["probes"][i]["expression_level"]] for i in range(len(probe_ids))]
    well_ids = [sample["sample"]["well"] for sample in data["msg"]["samples"]]
    donor_names = [sample["donor"]["name"] for sample in data["msg"]["samples"]]
    well_coordinates = [sample["sample"]["mri"] for sample in data["msg"]["samples"]]
    return expression_values, well_ids, well_coordinates, donor_names
def get_mni_coordinates_from_wells(well_ids):
    """Map well IDs to the corrected MNI coordinates bundled with the package.

    Returns a list of coordinate tuples, one per requested well.
    """
    package_directory = os.path.dirname(os.path.abspath(__file__))
    frame = pd.read_csv(os.path.join(package_directory, "data", "corrected_mni_coordinates.csv"), header=0, index_col=0)
    # BUG FIX: DataFrame.ix was deprecated and later removed from pandas;
    # .loc is the label-based replacement (well ids are index labels here --
    # confirm they are not meant positionally).
    return list(frame.loc[well_ids].itertuples(index=False))
if __name__ == '__main__':
    # Python 2 entry point: probe lookup -> expression values -> MNI coords.
    probes_dict = get_probes_from_genes("SLC6A2")
    expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
    print get_mni_coordinates_from_wells(well_ids)
whoa, first things first. Python ain't Java, so do yourself a favor and use a nice """xxx\nyyy""" string, with triple quotes to multiline.
api_query = """?criteria=model::Probe"
,rma::criteria,[probe_type$eq'DNA']
...
"""
or something like that. you will get white spaces as typed, so you may need to adjust.
If, like suggested, you opt to loop on the call to your function through a file, you will need to either try/except your data-not-found exception or you will need to handle missing data without throwing an exception. I would opt for returning an empty result myself and letting the caller worry about what to do with it.
If you do opt for raise-ing an Exception, create your own, rather than using a generic exception. That way your code can catch your expected Exception first.
class MyNoDataFoundException(Exception):
    """Raised when a gene/probe query comes back empty."""
#replace your current raise code with...
if not d:
raise MyNoDataFoundException(your message here)
clarification about catching exceptions, using the accepted answer as a starting point:
if __name__ == '__main__':
    # Drive the probe lookup once per gene name listed in the input file.
    with open(r"/tmp/genes.txt","r") as f:
        for line in f.readlines():
            #keep track of your input data
            search_data = line.strip()
            try:
                probes_dict = get_probes_from_genes(search_data)
            except MyNoDataFoundException, e:  # Python 2 'except X, e' syntax
                #and do whatever you feel you need to do here...
                print "bummer about search_data:%s:\nexception:%s" % (search_data, e)
            # NOTE(review): there is no 'continue' in the except branch, so on
            # a failed lookup the next line still runs with a stale (or, on
            # the first iteration, undefined) probes_dict -- confirm intent.
            expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
            print get_mni_coordinates_from_wells(well_ids)
You may want to create a file with Gene names, then read content of the file and call your function in the loop. Here is an example below
if __name__ == '__main__':
    # Read one gene name per line and run the full pipeline for each
    # (Python 2: print statements).
    with open(r"/tmp/genes.txt","r") as f:
        for line in f.readlines():
            probes_dict = get_probes_from_genes(line.strip())
            expression_values, well_ids, well_coordinates, donor_names = get_expression_values_from_probe_ids(probes_dict.keys())
            print get_mni_coordinates_from_wells(well_ids)
I am working with images that have multiple layers, which are described in their metadata, which looks like this:
print layers
Cube1[visible:true, mode:Normal]{r:Cube1.R, g:Cube1.G, b:Cube1.B, a:Cube1.A}, Ground[visible:true, mode:Lighten, opacity:186]{r:Ground.R, g:Ground.G, b:Ground.B, a:Ground.A}, Cube3[visible:true, mode:Normal]{r:Cube3.R, g:Cube3.G, b:Cube3.B, a:Cube3.A}
I'm wondering if this formatting could be recognized by Python as more than a string. Ideally I would like to look up the properties of any one of the layers. For example:
print layers[0].mode
"Normal"
On another post someone showed me how to get the names of each layer, which was very helpful, but now I'm looking to use the other info.
PS: if it helps I don't care about any of the info inside the {}
Thanks
print type(layers)
<type 'str'>
In case you don't want to deal with regex ...
# Parse "Name[k:v, ...]{...}" layer metadata into a nested dict, ignoring
# the channel info inside the curly braces.
layers = "Cube1[visible:true, mode:Normal]{r:Cube1.R, g:Cube1.G, b:Cube1.B, a:Cube1.A}, Ground[visible:true, mode:Lighten, opacity:186]{r:Ground.R, g:Ground.G, b:Ground.B, a:Ground.A}, Cube3[visible:true, mode:Normal]{r:Cube3.R, g:Cube3.G, b:Cube3.B, a:Cube3.A}"

layer_dict = {}
for chunk in layers.split('}'):
    chunk = chunk.strip(', ')
    bracket = chunk.find('[')
    if bracket < 1:
        # No "Name[" prefix in this chunk (e.g. the trailing empty piece).
        continue
    layer_name = chunk[:bracket]
    attr_text = chunk[bracket + 1:chunk.find(']')]
    # Each attribute is "key:value"; extra ':'-separated parts are ignored,
    # matching the original's use of only indices [0] and [1].
    pairs = [item.split(':') for item in attr_text.split(', ')]
    layer_dict[layer_name] = {pair[0]: pair[1] for pair in pairs}
# Python 2 print statements demonstrating the parsed structure.
print 'Cube1 ... mode:', layer_dict.get('Cube1').get('mode')
print 'Ground ... opacity:', layer_dict.get('Ground').get('opacity')
print 'Cube3', layer_dict.get('Cube3')
output ...
Cube1 ... mode: Normal
Ground ... opacity: 186
Cube3 {'visible': 'true', 'mode': 'Normal'}
Parsing (Pyparsing et al) is surely the correct and extensible way to go, but here's a fast-and-dirty object and constructors using regexes and comprehensions to parse properties and bolt them on with setattr(). All constructive criticisms welcome!
import re
#import string
# NOTE(review): the '#classmethod' line below is almost certainly a
# '@classmethod' decorator mangled by the site's formatting (the same
# mangling affects the '#register.filter' lines elsewhere on this page).
class Layer(object):
    """One image layer; attribute key/value pairs are bolted on via setattr."""
    #classmethod
    def make_list_from_string(cls,s):
        # Each "Name[k:v, ...]" chunk in the metadata becomes one Layer.
        all_layers_params = re.findall(r'(\w+)\[([^\]]+)\]',s)
        return [cls(lname,largs) for (lname, largs) in all_layers_params]

    def __init__(self,name,args):
        self.name = name
        # Parse "key:value" pairs and attach each one as an instance
        # attribute; all values are stored as strings.
        for (larg,lval) in re.findall(r'(\w+):(\w+)(?:,\w*)?', args):
            setattr(self,larg,lval)

    def __str__(self):
        # Python 2 only: dict.iteritems() was removed in Python 3.
        return self.name + '[' + ','.join('%s:%s' % (k,v) for k,v in self.__dict__.iteritems() if k!='name') + ']'

    def __repr__(self):
        return self.__str__()
# Example: parse the metadata string into a list of Layer objects.
t = 'Cube1[visible:true, mode:Normal]{r:Cube1.R, g:Cube1.G, b:Cube1.B, a:Cube1.A}, Ground[visible:true, mode:Lighten, opacity:186]{r:Ground.R, g:Ground.G, b:Ground.B, a:Ground.A}, Cube3[visible:true, mode:Normal]{r:Cube3.R, g:Cube3.G, b:Cube3.B, a:Cube3.A}'
layers = Layer.make_list_from_string(t)
I moved all the imperative code into __init__() or the classmethod Layer.make_list_from_string().
Currently it stores all args as string, it doesn't figure opacity is int/float, but that's just an extra try...except block.
Hey, it does the job you wanted. And as a bonus it throws in mutability:
print layers[0].mode
'Normal'
print layers[1].opacity
'186'
print layers[2]
Cube3[visible:true,mode:Normal]
layers[0].mode = 'Weird'
print layers[0].mode
'Weird'
"I'm wondering if this formatting could be recognizable by Python as more then a string."
Alternatively, I was thinking if you tweaked the format a little, eval()/exec() could be used, but that's yukkier, slower and a security risk.
I'm trying to implement a WikiLink template filter in Django that queries the database model to give different responses depending on Page existence, identical to Wikipedia's red links. The filter does not raise an Error but instead doesn't do anything to the input.
WikiLink is defined as: [[ThisIsAWikiLink | This is the alt text]]
Here's a working example that does not query the database:
from django import template
from django.template.defaultfilters import stringfilter
from sites.wiki.models import Page
import re
register = template.Library()
#register.filter
#stringfilter
def wikilink(value):
    """Strip [[Target | alt text]] wiki markup down to just the alt text."""
    link_markup = r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]'
    return re.sub(link_markup, r'\2', value)
wikilink.is_safe = True
The input (value) is a multi-line string, containing HTML and many WikiLinks.
The expected output is substituting [[ThisIsAWikiLink | This is the alt text]] with
This is the alt text
or if "ThisIsAWikiLink" doesn't exist in the database:
This is the alt text
and returning value.
Here's the non-working code (edited in response to comments/answers):
from django import template
from django.template.defaultfilters import stringfilter
from sites.wiki.models import Page
import re
register = template.Library()
#register.filter
#stringfilter
def wikilink(value):
    """Template filter: render the first [[Alias | title]] wiki-link in value.

    BUG FIXES over the posted version:
    - re.search instead of re.match, so a link anywhere in the (multi-line)
      input is found, not only at position 0;
    - the pattern has exactly two capture groups, so group(2)/group(3)
      raised IndexError -- the alias is group(1), the title group(2).
    """
    m = re.search(r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]', value)
    if(m):
        page_alias = m.group(1)
        page_title = m.group(2)
        try:
            page = Page.objects.get(alias=page_alias)
            # NOTE(review): both branches still produce identical output;
            # the missing-page branch is presumably meant to style the link
            # differently (red link) -- see the answers below.
            return re.sub(r'(\[\[)(.*)\|(.*)(\]\])', r'\3', value)
        except Page.DoesNotExist:
            return re.sub(r'(\[\[)(.*)\|(.*)(\]\])', r'\3', value)
    else:
        return value
wikilink.is_safe = True
What the code needs to do is:
extract all the WikiLinks in value
query the Page model to see if the page exists
substitute all the WikiLinks with normal links, styled dependent on each wikipage existence.
return the altered value
The updated question is:
What regular expression (method) can return a python List of WikiLinks, which can be altered and used to substitute the original matches (after being altered).
Edit:
I'd like to do something like this:
def wikilink(value):
regex = re.magic_method(r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]', value)
foreach wikilink in regex:
alias = wikilink.group(0)
text = wikilink.group(1)
if(alias exists in Page):
regex.sub(""+ text +"")
else:
regex.sub("<a href="+alias+" class='redlink'>"+ text +"</a>")
return value
If your string contains other text in addition to the wiki-link, your filter won't work because you are using re.match instead of re.search. re.match matches at the beginning of the string. re.search matches anywhere in the string. See matching vs. searching.
Also, your regex uses the greedy *, so it won't work if one line contains multiple wiki-links. Use *? instead to make it non-greedy:
re.search(r'\[\[(.*?)\|(.*?)\]\]', value)
Edit:
As for tips on how to fix your code, I suggest that you use re.sub with a callback. The advantages are:
It works correctly if you have multiple wiki-links in the same line.
One pass over the string is enough. You don't need a pass to find wiki-links, and another one to do the replacement.
Here is a sketch of the implmentation:
# Sketch (not runnable as-is): replace every wiki-link in one re.sub pass.
import re

# Compiled once at module scope so repeated filter calls reuse the pattern.
WIKILINK_RE = re.compile(r'\[\[(.*?)\|(.*?)\]\]')

def wikilink(value):
    def wikilink_sub_callback(match_obj):
        alias = match_obj.group(1).strip()
        text = match_obj.group(2).strip()
        # Pseudocode placeholder: substitute a real Page-existence query.
        if(alias exists in Page):
            class_attr = ''
        else:
            class_attr = ' class="redlink"'
        return '<a href="%s"%s>%s</a>' % (alias, class_attr, text)
    # One pass over the input: every link is rewritten by the callback.
    return WIKILINK_RE.sub(wikilink_sub_callback, value)
This is the type of problem that falls quickly to a small set of unit tests.
Pieces of the filter that can be tested in isolation (with a bit of code restructuring):
Determining whether or not value contains the pattern you're looking for
What string gets generated if there is a matching Page
What string gets generated is there isn't a matching Page
That would help you isolate where things are going wrong. You'll probably find that you'll need to rewire the regexps to account for optional spaces around the |.
Also, on first glance it looks like your filter is exploitable. You're claiming the result is safe, but you haven't filtered the alt text for nasties like script tags.
Code:
import re

def page_exists(alias):
    """Stub page lookup -- swap in a real database query as needed."""
    if alias == 'ThisIsAWikiLink':
        return True
    return False

def wikilink(value):
    """Replace every [[Alias | text]] wiki-link in value with an HTML anchor.

    Links whose page exists become plain anchors; missing pages get
    class="redlink" (Wikipedia-style red links). None passes through.

    BUG FIX (reconstruction): the posted replacement strings had their HTML
    stripped by the site's formatting ('%s' % (alias, text) is a TypeError
    as written); the anchor markup is restored here from the sketches
    earlier on this page. Alias/text are also re.escape()d when rebuilding
    the match pattern so metacharacters in link text cannot break the
    substitution.
    """
    if value is None:
        return None
    for alias, text in re.findall(r'\[\[\s*(.*?)\s*\|\s*(.*?)\s*\]\]', value):
        link_re = r'\[\[\s*%s\s*\|\s*%s\s*\]\]' % (re.escape(alias), re.escape(text))
        if page_exists(alias):
            replacement = '<a href="%s">%s</a>' % (alias, text)
        else:
            replacement = '<a href="%s" class="redlink">%s</a>' % (alias, text)
        # A callable replacement keeps backslashes in user text literal.
        value = re.sub(link_re, lambda _m: replacement, value)
    return value
Sample results:
>>> import wikilink
>>> wikilink.wikilink(None)
>>> wikilink.wikilink('')
''
>>> wikilink.wikilink('Test')
'Test'
>>> wikilink.wikilink('[[ThisIsAWikiLink | This is the alt text]]')
'This is the alt text'
>>> wikilink.wikilink('[[ThisIsABadWikiLink | This is the alt text]]')
'This is the alt text'
>>> wikilink.wikilink('[[ThisIsAWikiLink | This is the alt text]]\n[[ThisIsAWikiLink | This is another instance]]')
'This is the alt text\nThis is another instance'
>>> wikilink.wikilink('[[ThisIsAWikiLink | This is the alt text]]\n[[ThisIsAWikiLink | This is another instance]]')
General comments:
findall is the magic re function you're looking for
Change page_exists to run whatever query you want
Vulnerable to HTML injection (as mentioned by Dave W. Smith above)
Having to recompile the regex on each iteration is inefficient
Querying the database each time is inefficient
I think you'd run into performance issues pretty quickly with this approach.
This is the working code in case someone needs it:
from django import template
from django.template.defaultfilters import stringfilter
from sites.wiki.models import Page
import re
register = template.Library()
#register.filter
#stringfilter
def wikilink(value):
    """Expand [[Alias | alt text]] markup into HTML anchors.

    Links whose Page record is missing are tagged class="redlink",
    mirroring Wikipedia's red links.
    """
    link_pattern = re.compile(r'\[\[ ?(.*?) ?\| ?(.*?) ?\]\]')

    def render_link(match):
        alias = match.group(1).strip()
        text = match.group(2).strip()
        try:
            Page.objects.get(alias=alias)
            class_attr = ''
        except Page.DoesNotExist:
            class_attr = ' class="redlink"'
        return '<a href="%s"%s>%s</a>' % (alias, class_attr, text)

    return link_pattern.sub(render_link, value)
wikilink.is_safe = True
Many thanks for all the answers!