I created a HTML text cleaner, which deletes data between tags.
It's working fine on one iteration, but not in a loop.
The problem is, I cannot save newhtml as a variable due to Python's string immutability.
So, my loop is only working for the last iteration of the function return.
What would be the best practice in such a situation?
def find_all(a_str, sub):
start = 0
while True:
start = a_str.find(sub, start)
if start == -1: return
yield start
start += len(sub) # use start += 1 to find overlapping matches
def replace_string(index1, index2, mainstring):
replacementstring = ''
return mainstring.replace(mainstring[index1:index2], replacementstring)
def strip_images(html):
begin_indexes = list(find_all(html, '<DESCRIPTION>GRAPHIC'))
end_indexes = list(find_all(html, '</TEXT>'))
for i in range(len(begin_indexes)):
if begin_indexes[i] > end_indexes[i]:
end_indexes.pop(0)
else:
if len(begin_indexes) == len(end_indexes):
break
for i in range(len(begin_indexes)):
#code problem is here--
newhtml = replace_string(begin_indexes[i],end_indexes[i], html)
if i == len(begin_indexes) - 1:
return newhtml
#code only returns one iteration
var = strip_images(html)
print var
Your current issue is that html never changes within the loop. So, your input will always be for the first iteration, regardless of the length of the lists.
The solution here follows these steps
Assign the string to the original value before the loop
Edit within the loop, passing in the current content, returning a replaced string
Return from the function after the loop
newhtml = html
for begin, end in zip(begin_indexes, end_indexes):
newhtml = replace_string(begin, end, newhtml)
return newhtml
Got it working, here's the code snippet. It's not pretty but it's doing the job of removing text between those two tags:
def find_all(a_str, sub):
start = 0
while True:
start = a_str.find(sub, start)
if start == -1: return
yield start
start += len(sub) # use start += 1 to find overlapping matches
def strip_images(html):
begin_indexes = list(find_all(html, '<DESCRIPTION>GRAPHIC'))
end_indexes = list(find_all(html, '</TEXT>'))
for i in range(len(begin_indexes)):
if begin_indexes[i] > end_indexes[i]:
end_indexes.pop(0)
else:
if len(begin_indexes) == len(end_indexes):
break
newhtml = html
begin_indexes2 = begin_indexes[::-1]
end_indexes2 = end_indexes[::-1]
for i in range(len(begin_indexes2)):
#for i, value in enumerate(begin_indexes,0):
#end_indexes.reset_index(drop=True)
newhtml = list(newhtml)
del newhtml[begin_indexes2[i]:end_indexes2[i]]
if i == len(begin_indexes2) - 1:
str1 = ''.join(newhtml)
return str1
Related
Im coding a madlibs exercise and it is not returning the desired outcome. It should replace the words NOUN and VERB with random verbs and nouns that are defined in the functions in the code.
I have created two test sentences and after running the code I only get the first character of both sentences. I cant think for the life of me why!!
from random import randint
def random_verb():
random_num = randint(0, 1)
if random_num == 0:
return "run"
else:
return "kayak"
def random_noun():
random_num = randint(0,1)
if random_num == 0:
return "sofa"
else:
return "llama"
def word_transformer(word):
if word == "NOUN":
return random_noun()
elif word == "VERB":
return random_verb()
else:
return word[0]
def process_madlib(madlib):
#the finished sentence
processed = ""
#starting point
index = 0
#length to cut from
box_length = 4
#
while index<len(madlib):
#what you cut off from string
frame = madlib[index:index+box_length]
#put to string
to_add = word_transformer(frame)
processed += to_add
if len(to_add) == 1:
index +=1
else:
index +=5
return processed
# your code here
# you may find the built-in len function useful for this quiz
# documentation: https://docs.python.org/2/library/functions.html#len
test_string_1 = "This is a good NOUN to use when you VERB your food"
test_string_2 = "I'm going to VERB to the store and pick up a NOUN or two."
print process_madlib(test_string_1)
print process_madlib(test_string_2)
the outcome is
T
I
Your return is not correctly placed.
The return processed is inside the while loop - so after the first iteration of your while loop you will always return the value - which is just the first letter in the sentence.
You need to place it outside.
def process_madlib(madlib):
#the finished sentence
processed = ""
#starting point
index = 0
#length to cut from
box_length = 4
#
while index<len(madlib):
#what you cut off from string
frame = madlib[index:index+box_length]
#put to string
to_add = word_transformer(frame)
processed += to_add
if len(to_add) == 1:
index +=1
else:
index +=5
return processed # <---- THIS IS WHAT WAS CHANGED
This gives the output:
This is a good sofato use when you runyour food
I'm going to kayakto the store and pick up a llamaor two.
I am developing a breadth-first-search algorithm for a factorization problem and am running into an interesting/confusing bug when attempting to break out of a while loop. If you run the code below, it will fail inside the "construct_path" method, stating :
File "main.py", line 96
break
SyntaxError: 'break' outside loop
but I am inside of a while loop! If anyone could give me some advice on this issue, I would really appreciate it. Thanks in advance.
from numpy import random
import itertools
import Queue
#Finding multiples, BFS problem
#Given input of list with unique integers 0 - 9 and n = range(0,1000000), calculate smallest multiple of n and unique combination of values in the list
#Example : Input : list = {0,1,2} , n = 3,
# output = 12
# Input : list = {0,1,2} , n = 50
# Output = 200
class Problem:
def __init__(self):
self.n = random.randint(0,10000000)
listSize = random.randint(1,9)
mainSet = set()
self.mainList = []
while True:
toAdd = random.randint(0,9)
if(toAdd not in self.mainList):
self.mainList.append(toAdd)
if(len(self.mainList) == listSize):
break
def get_start_state(self):
s = ''.join(map(str, self.mainList))
return int(s)
def is_goal(self, state):
return True
def get_sucessors(self):
print "Getting successors"
def breadth_first_search(problem):
# a FIFO open_set
open_set = Queue.Queue()
# an empty set to maintain visited nodes
closed_set = set()
# a dictionary to maintain meta information (used for path formation)
meta = dict() # key -> (parent state, action to reach child)
# initialize
start = problem.get_start_state()
meta[start] = (None, None)
open_set.put(start)
while not open_set.empty():
parent_state = open_set.get()
print "{} {}".format("parent_state is ", parent_state)
if problem.is_goal(parent_state):
return construct_path(parent_state, meta)
for (child_state, action) in problem.get_successors(parent_state):
if child_state in closed_set:
continue
if child_state not in open_set:
meta[child_state] = (parent_state, action)
open_set.put(child_state)
closed_set.add(parent_state)
#collect path to desired answer
def construct_path(state, meta):
action_list = list()
while True:
row = meta[state]
if (len(row) == 2):
state = row[0]
action = row[1]
action_list.append(action)
else:
break
return action_list.reverse()
x = Problem()
breadth_first_search(x)
Could be that you have a mix of tabs and spaces so that the break in line 96 looks like it is indented to be below action_list.append(action) but effectively it is below the while. That would explain the error at least.
It is just a guess. But it could be like this, using a visible tabwidth of 4 in the editor:
→ while True:
→ → row = meta[state]
if (len(row) == 2):
state = row[0]
action = row[1]
action_list.append(action)
else:
break
To the Python interpreter this looks like this (because it assumes a tabwidth of 8):
→ while True:
→ → row = meta[state]
if (len(row) == 2):
state = row[0]
action = row[1]
action_list.append(action)
else:
break
This is still valid but obviously means a different thing and would put your break outside of the while loop.
I received an interesting challenge in an algorithm Meetup. Given an input string, return a string in which all substrings within brackets have been replicated n times, where n is the integer outside the brackets. Characters outside brackets should simply be concatenated to the substring inside. For example:
2[ab] should return abab
a[3[bc]] should return abcbcbc
2[ab[cd]] should return abcdabcd
I've started implementing the solution using a stack, but I've got the feeling that my approach of checking each de-stacked character for a bracket is off, anyone have any suggestions? Code is below
class Stack:
def __init__(self):
self.items = []
def push(self, item):
self.items.append(item)
def pop(self):
return self.items.pop()
def length(self):
return len(self.items)
def is_number(s):
try:
int(s)
return True
except ValueError:
return False
def character_math(charstr):
final_output = ""
substring = ""
for i in charstr:
myStack.push(i)
for m in range(myStack.length() - 2):
destacked = myStack.pop()
# We want to go to the inner-most right bracket
if destacked != "]":
substring += destacked
if destacked == "[":
possible_multiplier = myStack.pop()
if is_number(possible_multiplier):
final_output += int(possible_multiplier) * substring
else:
final_output += possible_multiplier[::-1]
break
final_output += substring[::-1]
return "Final output is ", final_output
myStack = Stack()
# 3[ab[cd]] should return 'abcdabcd'
sample_str = '2[ab[cd]]'
print(character_math(sample_str))
The best way to do that is to use a recursive algorithm. The idea is to repeat a function until a condition is match. Here is the code I used, it works on your examples, and I don't think I forgot one of the possibilities.
# -*-coding:Utf-8 -*
Input = "2[ab[cd]]"
def Treatment(STR):
# Exit the treatment. That's the end condition.
if "[" not in STR:
return STR
# Find the inner [], in this case, the "cd" part
Bound1_ID = len(STR) - STR[::-1].index("[") - 1
Bound2_ID = STR.index("]")
# Separate STR into : First_part + middle between [] + Last_part
Last_part = STR[Bound2_ID + 1:]
# First_part depends if there is a number or not
try:
Multiplier = int(STR[Bound1_ID - 1])
First_part = STR[:Bound1_ID - 1]
except:
Multiplier = 1
First_part = STR[:Bound1_ID]
Middle_part = STR[Bound1_ID + 1: Bound2_ID] * Multiplier
# Assemble the new STR :
New_STR = First_part + Middle_part + Last_part
# Recursive command, repeat the function on the new STR
return Treatment(New_STR)
print (Treatment(Input))
EDIT : That's what it does :
First iteration : "2[ab[cd]]"
Second iteration : "2[abcd]"
Third iteration : abcdabcd => No more "[" so stop here.
i've searched the forum and found similar questions, but no luck in solving my problem.
My code is designed to swap every two letters of each word using recursion and print the result. For words with an even amount of letters, the word "None" is included in the output and i don't know how to fix...
here's the code:
def encryptLine(line, count):
headline = line[count:]
if length(headline) > 0:
if count == length(line) - 1:
new = headline
return new
elif count <= length(line):
new = head(tail(headline)) + head(headline)
new = new + str(encryptLine(line, count+2))
return new
print(encryptLine('abcd', 0))
the output for 'abcd' is badcNone, which is correct except for the word None. the output for 'abcde' is 'badce', which is correct...
thanks in advance for your help!
Add return "" to the function definition, that is
def encryptLine(line, count):
headline = line[count:]
if length(headline) > 0:
if count == length(line) - 1:
new = headline
return new
elif count <= length(line):
new = head(tail(headline)) + head(headline)
new = new + str(encryptLine(line, count+2))
return new
return ""
Otherwise, the function will return None if length(headline) > 0 does not hold.
None is here because your function return nothing.
There is 1 case where you return nothing it is
if length(headline) <= 0:
In Python, if there is no return to a function and you try to access to a return value, the value will be None.
I have an assignment said that to create a findString function that accept 2 string which are 'target' and 'query', and that returns
a
list
of
all
indices
in
target
where
query
appears.
If
target
does
not
contain
query,
return
an
empty
list.
For example:
findString(‘attaggtttattgg’,’gg’)
return:
[4,12]
I dont know how to start off with this function writing at all. Please help me everyone. Thank you so much!!!
since an answer has already been given:
def find_matches(strng, substrng):
substrg_len = len(substr)
return [i for i in range(len(strg) + 1 - substrg_len)
if strg[i:i+substrg_len] == substrg]
def find_string(search, needle):
start = -1
results = []
while start + 1< len(search):
start = search.find(needle, start +1)
if start == -1:
break
results.append(start )
return results
Here are a couple of hints to get you started.
target.find(query) will return the index of query in target. If query is not found, it will return -1.
A string can be sliced. target[pos:] will give you a substring of target starting from pos.
This may require some error handling:
def find_allPatterns(strVal, strSub):
listPos = []
strTemp = strVal
while True:
try:
posInStr = strTemp.index(strSub)
except ValueError:
posInStr = None
if posInStr:
listPos.append(posInStr)
subpos = posInStr + len(strSub)
strTemp = strTemp[subpos:]
else:
break
return listPos
print find_allPatterns('attaggtttattgg', 'gg')
Output:
[4, 6]