How can I randomly generate strings with this CFG? - python

I have this code describing a context-free grammar and I'm trying to generate random strings that match it; for example, like this:
"John thinks that Mary hates every green cat"
But my current output is:
[['_S', ['_NP _VP']], ['_NP', ['_Det _Adj _N', '_Det _N', '_Adj _PropN', '_PropN']], ['_VP', ['_Vi', '_Vt _NP', '_Vc _Comp _S']]]
[['_Det', ['the', 'a', 'some', 'any', 'every']], ['_Adj', ['green', 'young', 'tired', 'confused']], ['_N', ['dog', 'cat']], ['_PropN', ['John', 'Mary']], ['_Vi', ['sleeps', 'walks']], ['_Vt', ['loves', 'hates']], ['_Vc', ['says', 'thinks', 'believes']], ['_Comp', ['that']]]
please help!
import random
psg_rules_str = "S → NP VP\n" \
"NP → Det Adj N | Det N | Adj PropN | PropN\n" \
"VP → Vi | Vt NP | Vc Comp S"
terminals_str = "Det → the | a | some | any | every\n" \
"Adj → green | young | tired | confused\n" \
"N → dog | cat\n" \
"PropN → John | Mary\n" \
"Vi → sleeps | walks\n" \
"Vt → loves | hates\n" \
"Vc → says | thinks | believes\n" \
"Comp → that"
psg_rules_list = [a.split("→") for a in psg_rules_str.split("\n")]
for p in psg_rules_list:
p[0] = "_" + p[0].strip()
p[1] = p[1].split("|")
p[1] = ["_" + a.strip().replace(" ", " _") for a in p[1]]
print(psg_rules_list)
# [['_S', ['_NP _VP']], ['_NP', ['_Det _Adj _N', '_Det _N', '_Adj _PropN', '_PropN']], ['_VP', ['_Vi', '_Vt _NP', '_Vc _Comp _S']]]
terminals_list = [a.split("→") for a in terminals_str.split("\n")]
for t in terminals_list:
t[0] = "_" + t[0].strip()
t[1] = t[1].split("|")
t[1] = [a.strip() for a in t[1]]
print(terminals_list)
# [['_Det', ['the', 'a', 'some', 'any', 'every']], ['_Adj', ['green', 'young', 'tired', 'confused']], ['_N', ['dog', 'cat']], ['_PropN', ['John', 'Mary']], ['_Vi', ['sleeps', 'walks']], ['_Vt', ['loves', 'hates']], ['_Vc', ['says', 'thinks', 'believes']], ['_Comp', ['that']]]
# Unfinished attempt at expanding a start symbol into a sentence.
# NOTE(review): no statement below assigns a new value to from_nts, so
# nts_todo is always recomputed from the same inputs; when it is non-empty
# the while loop cannot finish. No replacement is chosen and nothing is
# returned.
def reachTerminals(from_nts, with_rules, with_ts):
# Normalise the start symbol(s): drop underscores, prefix each symbol with "_", upper-case.
from_nts = str.upper("_" + from_nts.replace("_", "").strip().replace(" ", " _"))
# Left-hand-side tags of the non-terminal rules and of the terminal rules.
rule_tags = [a[0] for a in with_rules]
ts_tags = [a[0] for a in with_ts]
# Rule tags that occur (as substrings) in the current string.
nts_todo = [a for a in rule_tags if a in from_nts]
while nts_todo != list():
# Look up the alternatives for the first pending tag; they are not used yet.
tag = nts_todo[0]
wr_index = rule_tags.index(tag)
repl_choices = with_rules[wr_index][1]
nts_todo = [a for a in rule_tags if a in from_nts]
sentence = reachTerminals(from_nts="s", with_rules=psg_rules_list, with_ts=terminals_list)

You nearly have the program working. Here's a way to complete the reachTerminals function:
import random
# Grammar source text: one production per line, alternatives separated by "|".
psg_rules_str = (
    "S → NP VP\n"
    "NP → Det Adj N | Det N | Adj PropN | PropN\n"
    "VP → Vi | Vt NP | Vc Comp S"
)
terminals_str = (
    "Det → the | a | some | any | every\n"
    "Adj → green | young | tired | confused\n"
    "N → dog | cat\n"
    "PropN → John | Mary\n"
    "Vi → sleeps | walks\n"
    "Vt → loves | hates\n"
    "Vc → says | thinks | believes\n"
    "Comp → that"
)

# Phrase-structure rules as [tag, [alternatives]]; every symbol on either
# side of the arrow gets a "_" prefix so tags can be told apart from words.
psg_rules_list = []
for rule_line in psg_rules_str.split("\n"):
    lhs, rhs = rule_line.split("→")
    alternatives = []
    for alternative in rhs.split("|"):
        alternatives.append("_" + alternative.strip().replace(" ", " _"))
    psg_rules_list.append(["_" + lhs.strip(), alternatives])

# Terminal rules as [tag, [words]]; only the tag is prefixed, words stay as-is.
terminals_list = []
for terminal_line in terminals_str.split("\n"):
    lhs, rhs = terminal_line.split("→")
    words = [word.strip() for word in rhs.split("|")]
    terminals_list.append(["_" + lhs.strip(), words])
def reachTerminals(from_nts, with_rules, with_ts):
    """Expand grammar symbols into a random string of terminal words.

    Args:
        from_nts: Start symbol(s) as a space-separated string, e.g. "s".
            Underscores are dropped, every symbol gets a "_" prefix and the
            string is upper-cased before expansion.
        with_rules: List of [tag, [alternatives]] entries for non-terminals;
            each alternative is a space-separated string of tags.
        with_ts: List of [tag, [words]] entries for terminal tags.

    Returns:
        The generated words joined by single spaces. Symbols that match no
        tag are left in the result unchanged.
    """
    # Index both tables by tag; the first definition of a tag wins.
    rules = {}
    for entry in with_rules:
        rules.setdefault(entry[0], entry[1])
    terminals = {}
    for entry in with_ts:
        terminals.setdefault(entry[0], entry[1])

    normalised = str.upper("_" + from_nts.replace("_", "").strip().replace(" ", " _"))
    # Work on whole tokens rather than substrings, so that a tag which is a
    # prefix of another tag (e.g. "_N" and "_NP") never rewrites part of the
    # longer symbol.
    symbols = normalised.split()

    # Phase 1: keep rewriting until no non-terminal is left.
    while any(symbol in rules for symbol in symbols):
        expanded = []
        for symbol in symbols:
            if symbol in rules:
                expanded.extend(random.choice(rules[symbol]).split())
            else:
                expanded.append(symbol)
        symbols = expanded

    # Phase 2: replace every terminal tag by one of its words.
    words = [
        random.choice(terminals[symbol]) if symbol in terminals else symbol
        for symbol in symbols
    ]
    return " ".join(words)
print(reachTerminals(from_nts = "s", with_rules = psg_rules_list, with_ts = terminals_list))
The important tools for you to use are the random.choice function and the str.replace function's third parameter, which lets you only replace the first occurrence of the substring. I haven't thoroughly tested the code, but it seems to be working as expected. Example outputs:
green John loves some confused dog
Mary says that the tired dog says that some green cat hates some cat
every green dog loves young John
John loves the tired cat

Related

How to random swap CFG grammar?

I want to randomly swap the grammar I defined for CFG. For example I have this grammar:
grammar = CFG.fromstring(f"""
S -> NP VP | VP NP
NP -> DET N| ADJ N
VP -> V NP | V ADJ | V ADV
V -> {VERB}
N -> {NOUN}
ADJ ->{ADJ}
DET -> {DET}
ADV -> {ADV}
""")
I want to randomly swap S -> NP VP | VP NP to S -> VP NP | NP VP in each iteration, and in each iteration I'll generate only one sentence.
import spacy
import codecs
import nltk
from nltk.parse.generate import generate,demo_grammar
import random
from nltk import Nonterminal
import substitute
from nltk.grammar import CFG
nlp = spacy.load("en_core_web_sm")
fin = [
"Wow!The movie was a complete joy to watch, with an incredible cast delivering fantastic performances. The special effects were stunning. I highly recommend this movie to everyone."
]
for line in fin:
sent = line #Stripping the dataset based on tab. That is stripping label from sentence
words = [x.lower() for x in nltk.word_tokenize(sent)] #lowering the sentence and tokenizing
sent = ' '.join(words)
text = sent
doc = nlp(text)
noun = []
verb = []
adj = []
det = []
adv = []
pq = []
pp = []
intj = []
part = []
not_or_no = ""
# Token and Tag
for token in doc:
if token.text=="not" or token.text=="no":
print("found")
not_or_no = token.text
continue
# print(token.pos_)
if token.pos_ == "NOUN" or token.pos_ == "PROPN" or token.pos_ == "PRON":
noun.append(not_or_no + " " + token.text)
elif token.pos_ == "VERB" or token.pos_ == "AUX":
verb.append(not_or_no + " " + token.text)
elif token.pos_ == "DET":
det.append(token)
elif token.pos_ == "ADJ":
adj.append(not_or_no + " " + token.text)
elif token.pos_ == "ADV":
adv.append(not_or_no + " " + token.text)
not_or_no = ""
NOUN = ''
VERB = ' '
DET = ''
ADV = ''
ADJ = ' '
CONJ = ''
ADP = ''
for i in range(0,len(noun)):
NOUN += '"' + str(noun[i]) + '"'
if i != len(noun)-1 and len(noun) != 1:
NOUN += ' | '
for i in range(0,len(verb)):
VERB += '"' + str(verb[i])+ '"'
if i != len(verb) - 1 and len(verb) != 1 :
VERB += ' | '
for i in range(0,len(det)):
DET += '"'+ str(det[i])+ '"'
if i != len(det) - 1 and len(det) != 1:
DET += ' | '
for i in range(0,len(adj)):
ADJ += '"'+ str(adj[i])+ '"'
if i != len(adj) - 1 and len(adj) != 1:
ADJ += ' | '
for i in range(0,len(adv)):
ADV += '"'+ str(adv[i])+ '"'
if i != len(adv) - 1 and len(adv) != 1:
ADV += ' | '
grammar = CFG.fromstring(f"""
S -> NP VP | VP NP
NP -> DET N| ADJ N
VP -> V NP | V ADJ | V ADV
V -> {VERB}
N -> {NOUN}
ADJ ->{ADJ}
DET -> {DET}
ADV -> {ADV}
""")
for i in range(50):
# Swapping the `S` rule randomly
if random.choice([True, False]):
new_grammar = grammar.substitute(NP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
else:
new_grammar = grammar.substitute(VP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
for sentence in generate(new_grammar, n=1):
new_sent = ' '.join(sentence)
print(new_sent)
Traceback:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-26-81e9ddd4e9ec>", line 7, in <module>
import substitute
File "/usr/local/lib/python3.8/dist-packages/substitute/__init__.py", line 1, in <module>
from .substitute import *
File "/usr/local/lib/python3.8/dist-packages/substitute/substitute.py", line 126
equals = lambda (a,b):a==b
^
SyntaxError: invalid syntax
Here I want to randomly swap each nonterminal and generate a sentence from the randomly swapped grammar structure. But how can I randomly swap nonterminals like S, NP and VP?
Try this:
import random
from nltk.grammar import CFG
grammar = CFG.fromstring(f"""
S -> NP VP | VP NP
NP -> DET N| ADJ N
VP -> V NP | V ADJ | V ADV
V -> {VERB}
N -> {NOUN}
ADJ ->{ADJ}
DET -> {DET}
ADV -> {ADV}
""")
for i in range(50):
# Swapping the `S` rule randomly
if random.choice([True, False]):
new_grammar = grammar.substitute(NP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
else:
new_grammar = grammar.substitute(VP=grammar.productions()[0].rhs(), S=grammar.productions()[1].rhs())
for sentence in generate(new_grammar, n=1):
new_sent = ' '.join(sentence)
print(new_sent)
Here substitute method of the CFG class is used to create a new grammar with a nonterminal rule. random.choice([True, False]) is used to give either True or False so that we can randomly select the swap we want to make.
Edit
Try this:
# The swapped variant never changes between iterations, so parse it once
# instead of re-running CFG.fromstring inside the loop.
swapped_grammar = CFG.fromstring(f"""
S -> VP NP | NP VP
NP -> DET N | ADJ N
VP -> V NP | V ADJ | V ADV
V -> {VERB}
N -> {NOUN}
ADJ -> {ADJ}
DET -> {DET}
ADV -> {ADV}
""")
for _ in range(50):
    # Pick the swapped or the original rule order with equal probability.
    new_grammar = random.choice([swapped_grammar, grammar])
    # n=1 asks generate() for a single sentence of the chosen grammar.
    for sentence in generate(new_grammar, n=1):
        print(' '.join(sentence))

Parse ascii table header

So I need to parse this into dataframe or list:
tmp =
['+--------------+-----------------------------------------+',
'| Something to | Some header with subheader |',
'| watch or +-----------------+-----------------------+',
'| idk | First | another text again |',
'| | | with one more line |',
'| | +-----------------------+',
'| | | and this | how it be |',
'+--------------+-----------------+-----------------------+']
It is just txt table with strange header. I need to transform it to this:
['Something to watch or idk', 'Some header with subheader First', 'Some header with subheader another text again with one more line and this', 'Some header with subheader another text again with one more line how it be']
Here's my first solution, which gets me closer to the goal (you can see my attempts in the comments):
pluses = [i for i, element in enumerate(tmp) if element[0] == '+']
tmp2 = tmp[pluses[0]:pluses[1]+1].copy()
table_str=''.join(tmp[pluses[0]:pluses[1]+1])
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
tmp3=[]
strt = ''.join(tmp2.copy())
table_list = [l.strip().replace('\n', '') for l in re.split(r'\+[+-]+', strt) if l.strip()]
for row in table_list:
joined_row = ['' for _ in range(len(row))]
for lines in [line for line in row.split('||')]:
line_part = [i.strip() for i in lines.split('|') if i]
joined_row = [i + j for i, j in zip(joined_row, line_part)]
tmp3.append(joined_row)
here's out:
tmp3
out[4]:
[['Something to', 'Some header with subheader'],
['Something towatch or'],
['idk', 'First', 'another text again'],
['idk', 'First', 'another text againwith one more line'],
['idk'],
['', '', 'and this', 'how it be']]
All that remains is to join these in the right way, but I don't know how to...
Here's addon:
We can locate pluses and splitters by this:
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
[[0, 15, 57],
[0, 15, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 45, 57],
[0, 15, 33, 57]]
And then we can split or group by cell, but I don't know how to do that either... Please help.
Example No.2:
+----------+------------------------------------------------------------+---------------+----------------------------------+--------------------+-----------------------+
| Number | longtextveryveryloooooong | aaaaaaaaaaa | bbbbbbbbbbbbbbbbbb | dfsdfgsdfddd |qqqqqqqqqqqqqqqqqqqqqq |
| string | | | ccccccccccccccccccccc | affasdd as |qqqqqqqqqqqqqqqqqqqqqq |
| | | | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,| seeerrrr e, | dfsdfffffffffffff |
| | | | anothertext and something | percent | ttttttttttttttttt |
| | | | (nothingtodo), | | sssssssssssssssssssss |
| | | | and text | |zzzzzzzzzzzzzzzzzzzzzz |
| | | +----------------------------------+ | b rererereerr ppppppp |
| | | | all | longtext wit- | | |
| | | | |h many character| | |
+----------+------------------------------------------------------------+---------------+-----------------+----------------+--------------------+-----------------------+
You could do it recursively - parsing each "sub table" at a time:
def parse_table(table, header='', root='', table_len=None):
    """Flatten the cells of an ASCII grid table into one string per leaf column.

    The table is walked recursively: the first column is read, then any
    sub-table below it and any table to its right, with the text of enclosing
    cells prepended to the nested ones.

    Args:
        table: List of equally laid-out text lines; the first line is a
            ``+---+`` border.
        header: Text prepended to every cell produced by this call.
        root: Header of the first table in the current column; restored as
            ``header`` when moving to a sibling table on the same level.
        table_len: Number of lines of the outermost table; filled in on the
            first call.

    Returns:
        A list of strings, one per leaf column, left to right.
    """
    # Remember the height of the outermost table for the recursive calls.
    if not table_len:
        table_len = len(table)

    # Position of the '+' that closes the first column on the top border.
    col = table[0].find('+', 1)

    # Border lines (starting with '+') that also have a '+' at that position.
    border_rows = []
    for idx in range(1, len(table)):
        if table[idx].startswith('+') and table[idx][col] == '+':
            border_rows.append(idx)
    first_border = border_rows[0]

    # A text line normally ends at the column boundary; when the first text
    # line has a different number of '|' than the border has '+', cut at its
    # final '|' instead.
    end = col
    if table[0].count('+') != table[1].count('|'):
        end = table[1].rfind('|')

    # Split each text line into cells, then regroup the pieces per column and
    # join them behind the inherited header.
    split_lines = [line[1:end].split('|') for line in table[1:first_border]]
    content = []
    for cell_lines in zip(*split_lines):
        pieces = [header]
        pieces.extend(piece.strip() for piece in cell_lines)
        content.append(' '.join(pieces).strip())

    # Is there a table below?
    if first_border + 2 < len(table):
        if len(border_rows) > 1:
            # Not the last table: this cell is a header for what is below it.
            header = content.pop()
        else:
            header = content[-1]
        # First table in the column becomes the root.
        if not root:
            root = header
        below = [line[:col + 1] for line in table[first_border:]]
        content.extend(
            parse_table(below, header=header, root=root, table_len=table_len)
        )

    # Is there a table to the right?
    if col + 2 < len(table[0]):
        # Index of the line just above the first line with '|' at the boundary
        # (enumerate is started at -1 for that reason).
        start = next(
            idx for idx, line in enumerate(table, start=-1)
            if line[col] == '|'
        )
        right = [line[col:] for line in table[start:]]
        # New top-level table: reset root.
        if len(right) == table_len:
            root = ''
        # Next table on the same level: reset header.
        if len(table) == len(right):
            header = root
        content.extend(
            parse_table(right, header=header, root=root, table_len=table_len)
        )

    return content
Output:
>>> parse_table(table)
['Something to watch or idk',
'Some header with subheader First',
'Some header with subheader another text again with one more line and this',
'Some header with subheader another text again with one more line how it be']
>>> parse_table(big_table)
['Number string',
'longtextveryveryloooooong',
'aaaaaaaaaaa',
'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text all',
'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text longtext wit- h many character',
'dfsdfgsdfddd affasdd as seeerrrr e, percent',
'qqqqqqqqqqqqqqqqqqqqqq qqqqqqqqqqqqqqqqqqqqqq dfsdfffffffffffff ttttttttttttttttt sssssssssssssssssssss zzzzzzzzzzzzzzzzzzzzzz b rererereerr ppppppp']
>>> parse_table(planets)
['Planets Planet Sun (Solar) Earth Moon Mars',
'Planets R (km) 696000 6371 1737 3390',
'Planets mass (x 10^29 kg) 1989100000 5973.6 73.5 641.85']
As the input is in the format of a reStructuredText table, you could use the docutils table parser.
import docutils.parsers.rst.tableparser
from collections.abc import Iterable
def extract_texts(tds):
    """Recursively collect the text of the StringLists nested inside *tds*.

    Each StringList is reduced to its non-empty lines, stripped and joined by
    single spaces. Iteration over a container stops at its first StringList.
    Every other iterable element is descended into and contributes a nested
    list; non-iterable elements are ignored.
    """
    collected = []
    for item in tds:
        if isinstance(item, docutils.statemachine.StringList):
            stripped_lines = [line.strip() for line in list(item) if line]
            collected.append(' '.join(stripped_lines))
            # Nothing after the first StringList of a container is examined.
            break
        if isinstance(item, Iterable):
            collected.append(extract_texts(item))
    return collected
>>> parser = docutils.parsers.rst.tableparser.GridTableParser()
>>> tds = parser.parse(docutils.statemachine.StringList(tmp))
>>> extract_texts(tds)
[[],
[],
[[['Something to watch or idk'], ['Some header with subheader']],
[['First'], ['another text again with one more line']],
[['and this | how it be']]]]
then flatten.
For more general usage, it is worth taking a look at tds (the structure returned by parse); see the docutils documentation for a description of it.

text justification problem: chat with alternating alignment

I am trying to do a text justification on the following array and transform it into a chat system as pictured. So far I am able to get the alignment right, but I am stuck on trying to make it look like a chat conversation.
"1", "2" indicate the user, and their message, and the array should determine the alternating order of the alignment. Width is the total width of a line, user width is the max width that a user can take in one line. user 1 is aligned to the left, and user 2 to the right.
So far I have the following code:
messages = [["1", "Bob hello"], ["2", "Alice hi"], ["1", "How is your life"], ["1", "Better than before"],
["2", "M super"], ["1", "Wow pro"]]
userWidth = 6
width = 15
# Cut every message into chunks of userWidth characters, collect the chunks
# per user in user1 / user2, and hand both lists to constructor() for printing.
# messages is iterated as [user, text] pairs, where user is compared to "1".
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below cannot be read off the text; the comments describe
# single statements only.
def chat(messages, userWidth, width):
user1 = []
user2 = []
sep_num = []
order = []
for message in messages:
user = message[0]
convo = message[1]
windowStart = 0
# Per-message bookkeeping that is passed on to constructor().
sep_num.append(len(convo) // userWidth + 1)
order.append(user)
for windowEnd in range(len(convo)):
if windowEnd > 0:
# Every userWidth-th character closes a chunk convo[windowStart:windowEnd + 1].
if (windowEnd + 1) % userWidth == 0:
if user == "1":
left_aligned = convo[windowStart:windowEnd + 1]
user1.append(left_aligned)
else:
right_aligned = convo[windowStart:windowEnd + 1]
user2.append(right_aligned)
windowStart = windowEnd + 1
# Last character of the message: flush whatever is left after the last chunk.
if windowEnd == len(convo) - 1:
if user == "1":
left_aligned = convo[windowStart:windowEnd + 1]
# A one-character tail takes characters back from the previously stored chunk.
if len(left_aligned) == 1 and user1[-1][-3] != " ":
left_aligned = "".join([user1[-1][-1],left_aligned])
user1[-1] = user1[-1][:-1]
if len(left_aligned) == 1 and user1[-1][-3] == " ":
left_aligned = "".join([user1[-1][-3:], left_aligned])
user1[-1] = user1[-1][:-3]
user1.append(left_aligned)
else:
right_aligned = convo[windowStart:windowEnd + 1]
if len(right_aligned) == 1 and user2[-1][-3] != " ":
right_aligned = "".join([user2[-1][-1], right_aligned])
user2[-1] = user2[-1][:-1]
# NOTE(review): the next three lines read and trim user1 although a user2
# chunk is being built here; possibly a copy-paste slip, confirm intent.
if len(right_aligned) == 1 and user1[-1][-3] == " ":
right_aligned = "".join([user1[-1][-3:], right_aligned])
user1[-1] = user1[-1][:-3]
user2.append(right_aligned)
constructor(user1, user2, width, order, sep_num)
# Print the collected chunks: user1 chunks as "|" + text padded on the right,
# user2 chunks padded on the left and followed by "|".
# The two loops run one after the other, so all user1 lines are printed before
# any user2 line. The order and sep_num parameters are not used in this body.
def constructor(user1, user2, width, order, sep_num):
for i in range(len(user1)):
if (len(user1[i])) > 1:
# Drop a single leading space from a left-aligned chunk.
if user1[i][0] == " ":
user1[i] = user1[i][1:]
# Pad with spaces up to width characters after the leading "|".
space = width - len(user1[i])
line = "|" + user1[i] + (" " * space)
print(line)
for i in range(len(user2)):
if (len(user2[i])) > 1:
# Drop a single trailing space from a right-aligned chunk.
if user2[i][-1] == " ":
user2[i] = user2[i][:-1]
space = width - len(user2[i])
line = (" " * space) + user2[i] + "|"
print(line)
which makes it look like this:
|Bob he
|llo
|How is
|your
|life
|Better
|than
|before
|Wow
|pro
Alice|
hi|
M sup|
er|
But how can I transform it into the following:
You can try something like this:
from textwrap import wrap
# Each entry is [user, text]; user "1" is printed left-aligned, any other
# user right-aligned.
messages = [
    ["1", "Bob hello"],
    ["2", "Alice hi"],
    ["1", "How is your life"],
    ["1", "Better than before"],
    ["2", "M super"],
    ["1", "Wow pro"],
]
win = 15  # field width of every printed chat line (between the "|" bars)
lw = 6    # width passed to textwrap.wrap for one chunk of a message

border = "+" + "*" * win + "+"
print(border)
for num, msg in messages:
    pad = ">" if num != "1" else "<"
    # One output row per wrapped chunk, padded to the full window width.
    rows = [f"|{chunk:{pad}{win}}|" for chunk in wrap(msg, lw)]
    print("\n".join(rows))
print(border)
It gives:
+***************+
|Bob |
|hello |
| Alice|
| hi|
|How is |
|your |
|life |
|Better |
|than |
|before |
| M|
| super|
|Wow |
|pro |
+***************+

Translating an EBNF grammar to pyparsing give error

I am making a parser to convert a simple DSL into elasticsearch query. some of the possible queries are:
response:success
response:success AND extension:php OR extension:css
response:sucess AND (extension:php OR extension:css)
time >= 2020-01-09
time >= 2020-01-09 AND response:success OR os:windows
NOT reponse:success
response:success AND NOT os:windows
I have written the following EBNF grammar for this :
<expr> ::= <or>
<or> ::= <and> (" OR " <and>)*
<and> ::= <unary> ((" AND ") <unary>)*
<unary> ::= " NOT " <unary> | <equality>
<equality> ::= (<word> ":" <word>) | <comparison>
<comparison> ::= "(" <expr> ")" | (<word> (" > " | " >= " | " < " | " <= ") <word>)+
<word> ::= ("a" | "b" | "c" | "d" | "e" | "f" | "g"
| "h" | "i" | "j" | "k" | "l" | "m" | "n"
| "o" | "p" | "q" | "r" | "s" | "t" | "u"
| "v" | "w" | "x" | "y" | "z")+
The precedence of operators in the DSL is:
() > NOT > AND > OR
Also, exact matching (i.e. ':') has higher precedence than the comparison operators.
I believe the above grammar captures the idea of my DSL. I am having a difficult time translating it to pyparsing; this is what I have now:
from pyparsing import *
AND = Keyword('AND') | Keyword('and')
OR = Keyword('OR') | Keyword('or')
NOT = Keyword('NOT') | Keyword('not')
word = Word(printables, excludeChars=':')
expr = Forward()
expr << Or
Comparison = Literal('(') + expr + Literal(')') + OneOrMore(word + ( Literal('>') | Literal('>=') | Literal('<') | Literal('<=')) + word)
Equality = (word + Literal(':') + word) | Comparison
Unary = Forward()
Unary << (NOT + Unary) | Equality
And = Unary + ZeroOrMore(AND + Unary)
Or = And + ZeroOrMore(OR + And)
The error i get is :
Traceback (most recent call last):
File "qql.py", line 54, in <module>
expr << Or
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pyparsing.py", line 5006, in __lshift__
self.mayIndexError = self.expr.mayIndexError
AttributeError: type object 'Or' has no attribute 'mayIndexError'
I think it's because I am unable to understand Forward() correctly.
Question: how can I correctly translate the above grammar to pyparsing?
**EDIT **: when i changed the pyparsing code to:
AND = Keyword('AND')
OR = Keyword('OR')
NOT = Keyword('NOT')
word = Word(printables, excludeChars=':')
expr = Forward()
Comparison = Literal('(') + expr + Literal(')') + OneOrMore(word + ( Literal('>') | Literal('>=') | Literal('<') | Literal('<=')) + word)
Equality = (word + Literal(':') + word) | Comparison
Unary = Forward()
Unary << ((NOT + Unary) | Equality)
And = Unary + ZeroOrMore(AND) + Unary
Or = And + ZeroOrMore(OR + And)
expr << Or
Q = """response : 200 \
AND extesnion: php \
OR extension: css \
"""
print(expr.parseString(Q))
I get this output:
['response', ':', '200', 'AND', 'extesnion', ':', 'php']
Why is the OR expression not parsed?

Append value only when not found

If a taxonomy in taxonomies is not in translations, I want it to print, for example, 152W00000X | Not Found. Currently all of the lines print with Not Found. If I remove the else, I get an out of range error.
taxonomies = ['152W00000X', '156FX1800X', '200000000X', '261QD0000X', '3336C0003X', '333600000X', '261QD0000X']
translations = {'261QD0000X': 'Clinic/Center Dental', '3336C0003X': 'Pharmacy Community/Retail Pharmacy', '333600000X': 'Pharmacy'}
a = 0
final = []
for nums in taxonomies:
for i, v in translations.items():
if nums == i:
data = v
final.append(data)
else:
final.append('Not Found')
for nums in taxonomies:
print nums, "|", final[a]
a = a + 1
Current output is:
152W00000X | Not Found
156FX1800X | Not Found
200000000X | Not Found
261QD0000X | Not Found
3336C0003X | Not Found
333600000X | Not Found
261QD0000X | Not Found
The ideal output is:
152W00000X | Not Found
156FX1800X | Not Found
200000000X | Not Found
261QD0000X | Clinic/Center Dental
3336C0003X | Pharmacy Community/Retail Pharmacy
333600000X | Pharmacy
261QD0000X | Clinic/Center Dental
taxonomies = ['152W00000X', '156FX1800X', '200000000X', '261QD0000X', '3336C0003X', '333600000X', '261QD0000X']
translations = {'261QD0000X': 'Clinic/Center Dental', '3336C0003X': 'Pharmacy Community/Retail Pharmacy', '333600000X': 'Pharmacy'}
# One dictionary lookup per code; codes without a translation map to 'Not Found'.
final = [translations.get(nums, 'Not Found') for nums in taxonomies]
# Pair every code with its result instead of tracking a separate index.
for nums, label in zip(taxonomies, final):
    # Formatting the whole line first prints the same text under Python 2 and 3.
    print("%s | %s" % (nums, label))
I am using re to split IDVtaxo.txt at two or more spaces. This will work unless the source is actually delimited by tabs.
import re
# Fields in IDVtaxo.txt are separated by runs of two or more whitespace
# characters; compile the pattern once and split every line a single time.
column_gap = re.compile(r'\s{2,}')
idvtaxo = {}
with open('IDVtaxo.txt') as f:
    for line in f.read().splitlines():
        fields = column_gap.split(line)
        # First field is the key, third field the value.
        # NOTE(review): assumes every line has at least three fields - confirm
        # against the real file.
        idvtaxo[fields[0]] = fields[2]
with open('taxonomies.txt') as f:
    taxonomies = f.read().splitlines()
for taxonomy in taxonomies:
    # The code to translate is the second '|'-separated field of the line.
    data = taxonomy.split('|')
    translated = idvtaxo.get(data[1], 'Not Found')
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print('%s|%s' % (taxonomy, translated))

Categories

Resources