Python - Returning a string with recursion

The project that I am doing requires us to input a URL, follow the link at a particular position a given number of times, and then return the last page visited. I have already solved it with a while loop, but now I am trying to do it with recursion.
Example: http://py4e-data.dr-chuck.net/known_by_Fikret.html
Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.
Sequence of names: Fikret Montgomery Mhairade Butchi Anayah
Last name in sequence: Anayah
My code is this so far:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re

cnt = input("Enter count:")
post = input("Enter position:")
url = "http://py4e-data.dr-chuck.net/known_by_Fikret.html"
count = int(cnt)
position = int(post)

def GetLinks(initalPage, position, count):
    html = urlopen(initalPage).read()
    soup = BeautifulSoup(html, "html.parser")
    temp = ""
    links = list()
    tags = soup('a')
    for tag in tags:
        x = tag.get('href', None)
        links.append(x)
    print(links[position - 1])
    if count > 1:
        GetLinks(links[position - 1], position, count - 1)
    return links[position - 1]

y = GetLinks(url, position, count)
print("****", y)
I see two problems with my code.
First, I am creating a list that expands with every recursion, which makes it hard to locate the proper value.
Second, I am obviously returning the wrong item.
I don't know exactly how to fix this.
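One minimal way to address both points, as a sketch rather than a definitive fix: keep the link collection local to each call and return the result of the recursive call instead of discarding it. It reuses the imports and variable names from the code above; the function is renamed get_links here only for readability.

def get_links(page, position, count):
    # Parse the page and collect every href on it (local to this call).
    html = urlopen(page).read()
    soup = BeautifulSoup(html, "html.parser")
    links = [tag.get('href') for tag in soup('a')]

    next_link = links[position - 1]
    print(next_link)

    if count <= 1:
        # Base case: this is the last page we needed to visit.
        return next_link
    # Recursive case: return what the deeper call returns instead of discarding it.
    return get_links(next_link, position, count - 1)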

Related

getting specific part from a page source python

I am trying to extract a specific part from a page using regex but it isn't working.
This is the part I want to be extracted from the page:
{"clickTrackingParams":"CPcBEJhNIhMIwrDVo4qw3gIVTBnVCh28iAtzKPgd","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":true}},"performCommentActionEndpoint":{"action":"CAUQAhoaVWd4MEdWUGNadTdvclcwT09WdDRBYUFCQWcqC1pNZlAzaERwdjlBMAA4AEoVMTA1MTc3MTgyMDc5MDg5MzQ1ODM4UACKAVQSC1pNZlAzaERwdjlBMixlaHBWWjNnd1IxWlFZMXAxTjI5eVZ6QlBUMVowTkVGaFFVSkJadyUzRCUzRMABAMgBAOABAaICDSj___________8BQAA%3D","clientActions":[{"updateCommentVoteAction":{"voteCount":{"accessibility":{"accessibilityData":{"label":"80 likes"}},"simpleText":"80"},"voteStatus":"LIKE"}}]}}
So far I've tried this:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
mystrx = re.search(r'^{"clickTrackingParams".*"voteStatus":"LIKE"}}]}}', html_source)
but it didn't work out for me.
Try this:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
fst, snd = '{"clickTrackingParams":', '"voteStatus":"LIKE"}}]}}'
# Find the first occurrence of the end marker
end = html_source.find(snd)
# Get the closest preceding occurrence of the start marker
start = max(idx.start() for idx in re.finditer(fst, html_source) if idx.start() < end)
print(html_source[start:end+len(snd)])
Which outputs:
{"clickTrackingParams":"CPcBEJhNIhMIwrDVo4qw3gIVTBnVCh28iAtzKPgd","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":true}},"performCommentActionEndpoint":{"action":"CAUQAhoaVWd4MEdWUGNadTdvclcwT09WdDRBYUFCQWcqC1pNZlAzaERwdjlBMAA4AEoVMTA1MTc3MTgyMDc5MDg5MzQ1ODM4UACKAVQSC1pNZlAzaERwdjlBMixlaHBWWjNnd1IxWlFZMXAxTjI5eVZ6QlBUMVowTkVGaFFVSkJadyUzRCUzRMABAMgBAOABAaICDSj___________8BQAA%3D","clientActions":[{"updateCommentVoteAction":{"voteCount":{"accessibility":{"accessibilityData":{"label":"80 likes"}},"simpleText":"80"},"voteStatus":"LIKE"}}]}}
If you want to get the second occurrence, you can try something along the lines of:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
fst, snd = '{"clickTrackingParams":', '"voteStatus":"LIKE"}}]}}'
def find_nth(string, to_find, n):
    """
    Finds the nth match in string
    """
    # find all occurrences
    matches = [idx.start() for idx in re.finditer(to_find, string)]
    # return the nth match
    return matches[n]
# finds second match
end = find_nth(html_source, snd, 1)
# Gets closest index to end
start = max(idx.start() for idx in re.finditer(fst, html_source) if idx.start() < end)
print(html_source[start:end+len(snd)])
Note: in the second example you can run into an IndexError if you request an occurrence outside of the found matches. You will need to handle this yourself.
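As a sketch of that note (reusing re, html_source and snd from the example above; find_nth_safe is a hypothetical helper, not part of the original answer), one way to handle a missing occurrence is:

def find_nth_safe(string, to_find, n):
    """Like find_nth, but returns None instead of raising IndexError."""
    matches = [idx.start() for idx in re.finditer(to_find, string)]
    if n >= len(matches):
        # fewer than n+1 occurrences were found
        return None
    return matches[n]

end = find_nth_safe(html_source, snd, 1)
if end is None:
    print("second occurrence not found")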

Beautifulsoup scrape content of a cell beside another one

I am trying to scrape the content of the cell beside another cell whose name I know, e.g. "Staatsform", "Amtssprache", "Postleitzahl", etc. The needed content is always in the cell to the right.
The basic code is the following one, but I am stuck with it:
source_code = requests.get('https://de.wikipedia.org/wiki/Hamburg')
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
stastaform = soup.find(text="Staatsform:")...???
Many thanks in advance!
I wanted to limit the search to what the English-language Wikipedia calls the 'Infobox'. Therefore, I searched first for the heading 'Basisdaten', requiring that it be a th element; not definitive, perhaps, but more likely to be correct. Having found that, I looked at the tr elements following 'Basisdaten' until I found another tr containing a (presumably different) heading. Here I search for 'Postleitzahlen:', but this approach makes it possible to find any or all of the items between 'Basisdaten' and the next heading.
PS: I should also mention the reason for if not current.name. I noticed some lines consisting of just newlines, which BeautifulSoup treats as strings. These don't have names, hence the need to treat them specially in the code.
import requests
import bs4

page = requests.get('https://de.wikipedia.org/wiki/Hamburg').text
soup = bs4.BeautifulSoup(page, 'lxml')

def getInfoBoxBasisDaten(s):
    return str(s) == 'Basisdaten' and s.parent.name == 'th'

basisdaten = soup.find_all(string=getInfoBoxBasisDaten)[0]

wanted = 'Postleitzahlen:'
current = basisdaten.parent.parent.nextSibling
while True:
    if not current.name:
        current = current.nextSibling
        continue
    if wanted in current.text:
        items = current.findAll('td')
        print(items[0])
        print(items[1])
    if '<th ' in str(current):
        break
    current = current.nextSibling
The result looks like this: two separate td elements, as requested.
<td>Postleitzahlen:</td>
<td>20095–21149,<br/>
22041–22769,<br/>
27499</td>
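If only the plain text of the right-hand cell is wanted, a possible follow-up, assuming the items list from the loop above, is:

# items[1] is the right-hand cell; get_text() collapses the <br/> line breaks.
print(items[1].get_text(" ", strip=True))
# prints roughly: 20095–21149, 22041–22769, 27499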
This works most of the time:
def get_content_from_right_column_for_left_column_containing(text):
    """return the text contents of the cell adjoining a cell that contains `text`"""
    navigable_strings = soup.find_all(text=text)
    if len(navigable_strings) > 1:
        raise Exception('more than one element with that text!')
    if len(navigable_strings) == 0:
        # left-column contents that are links don't have a colon in their text content...
        if ":" in text:
            altered_text = text.replace(':', '')
        # ...but `td`s and `th`s do.
        else:
            altered_text = text + ":"
        navigable_strings = soup.find_all(text=altered_text)
    try:
        return navigable_strings[0].find_parent('td').find_next('td').text
    except IndexError:
        raise IndexError('there are no elements containing that text.')
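A hedged usage example, assuming the soup object built from the Hamburg page above (the exact output depends on the current state of the article):

# Look up the cell to the right of the "Postleitzahlen:" label.
plz = get_content_from_right_column_for_left_column_containing("Postleitzahlen:")
print(plz)  # e.g. 20095–21149, 22041–22769, 27499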

For Loop doesn't spit out needed results

I got this piece of code to spit out the unique "area number" in the URL. However, the loop doesn't work: it spits out the same number every time. Please see below:
import urllib3
from bs4 import BeautifulSoup

http = urllib3.PoolManager()
url = open('MS Type 1 URL.txt', encoding='utf-8-sig')
links = []
for link in url:
    y = link.strip()
    links.append(y)
url.close()

print('Amount of Links: ', len(links))

for x in links:
    j = (x.find("=") + 1)
    g = (x.find('&housing'))
    print(link[j:g])
Results are:
http://millersamuel.com/aggy-data/home/query_report?area=38&housing_type=3&measure=4&query_type=quarterly&region=1&year_end=2020&year_start=1980
23
http://millersamuel.com/aggy-data/home/query_report?area=23&housing_type=1&measure=4&query_type=annual&region=1&year_end=2020&year_start=1980
23
As you can see, it spits out the area number '23', which belongs to only one of these URLs, but never the '38' of the other URL.
There's a typo in your code. You iterate over the links list and bind its elements to the variable x, but you print a slice of link, so you get the same string printed on each loop iteration. You could change print(link[j:g]) to print(x[j:g]), but it's better to give your variables more descriptive names, so here's the fixed version of your loop:
for link in links:
    j = link.find('=') + 1
    g = link.find('&housing')
    print(link[j:g])
And I also want to show you a proper way to extract the area value from URLs:
from urllib.parse import urlparse, parse_qs
url = 'http://millersamuel.com/aggy-data/home/query_report?area=38&housing_type=3&measure=4&query_type=quarterly&region=1&year_end=2020&year_start=1980'
area = parse_qs(urlparse(url).query)['area'][0]
So instead of using the str.find method, you can write this:
for url in urls:
    parsed_qs = parse_qs(urlparse(url).query)
    if 'area' in parsed_qs:
        area = parsed_qs['area'][0]
        print(area)
Used functions:
urllib.parse.urlparse
urllib.parse.parse_qs
You need to change:
print(link[j:g]) to print(x[j:g])

Link Fetching List

So I've asked many questions regarding this one subject, and I'm sorry, but this is it.
So I have this code:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import sys
from collections import defaultdict

m_num = int(input('Enter number of monsters to look up: '))

for x in range(m_num):
    name = input("Enter a monster's name: ")
    url_name = name.replace(' ', '_')
    url = ('http://yugioh.wikia.com/wiki/Card_Tips:{}'.format(url_name))
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page.read())
    content = soup.find('div', id='mw-content-text')
    links = content.findAll('a')
    link_lists = defaultdict(list)
    for link in links:
        link_lists[x].append(link.get('title'))

all_lists = list(link_lists.values())
common_links = set(all_lists[0]).intersection(*all_lists[1:])
print('common links: ', common_links)
What I'm trying to do: for however many monsters the user specifies, that many lists are created. Each list is then filled with all the links from that specific page. Then, at the end, all the lists are compared to see whether they contain similar strings. (Hopefully that makes sense.)
So the problem I'm having is that when it gets to the print('common links: ', common_links) part, it only prints out the last list. It doesn't compare the lists, nor does it even recognize that the other lists were created.
Can anyone lend a helping hand? I've been troubleshooting this and I'm just stuck.
link_lists refers to a new dictionary on each iteration. You could drop it entirely: put all_lists = [] before the for x in range(m_num) loop, and replace the last three lines in the loop with all_lists.append([link.get("title") for link in links]). Note: you don't need to know m_num in this case:
all_lists = []
for name in iter(lambda: input("monster name"), ""):  # loop until empty name
    # ...
    titles = [link["title"] for link in content.findAll('a', title=True)]
    all_lists.append(titles)
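Putting those pieces together, a minimal sketch of the fixed program could look like this (the URL pattern and parsing calls are taken from the question; the empty-name loop is the alternative suggested above, and 'html.parser' is just an explicit parser choice):

import urllib.request
from bs4 import BeautifulSoup

all_lists = []
for name in iter(lambda: input("Enter a monster's name (blank to stop): "), ""):
    url = 'http://yugioh.wikia.com/wiki/Card_Tips:{}'.format(name.replace(' ', '_'))
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    content = soup.find('div', id='mw-content-text')
    # One list of link titles per monster page.
    titles = [link['title'] for link in content.findAll('a', title=True)]
    all_lists.append(titles)

if all_lists:
    # Titles that appear on every page entered.
    common_links = set(all_lists[0]).intersection(*all_lists[1:])
    print('common links:', common_links)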

Using Regex to Search for HTML links near keywords

I'm looking for the keyword "sales" and I want to get the nearest "http://www.somewebsite.com", even if there are multiple links in the file. I want the nearest link, not the first link. This means I need to search for the link that comes just before the keyword match.
This doesn't work...
regex = (http|https)://[-A-Za-z0-9./]+.*(?!((http|https)://[-A-Za-z0-9./]+))sales
What's the best way to find the link that is closest to a keyword?
It is generally much easier and more robust to use an HTML parser rather than regex.
Using the third-party module lxml:
import lxml.html as LH
content = '''<html>
<a href="http://www.somewebsite.com">link</a><p>other stuff</p><p>sales</p>
</html>
'''
doc = LH.fromstring(content)
for url in doc.xpath('''
        //*[contains(text(),"sales")]
        /preceding::*[starts-with(@href,"http")][1]/@href'''):
    print(url)
yields
http://www.somewebsite.com
I find lxml (and XPath) a convenient way to express what elements I'm looking for. However, if installing a third-party module is not an option, you could also accomplish this particular job with HTMLParser from the standard library:
import HTMLParser
import contextlib

class MyParser(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.last_link = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if 'href' in attrs:
            self.last_link = attrs['href']
content = '''<html>
<a href="http://www.somewebsite.com">link</a><p>other stuff</p><p>sales</p>
</html>
'''
idx = content.find('sales')
with contextlib.closing(MyParser()) as parser:
    parser.feed(content[:idx])
    print(parser.last_link)
Regarding the XPath used in the lxml solution, it has the following meaning:
//*                            # find all elements
[contains(text(),"sales")]     # whose text content contains "sales"
/preceding::*                  # then search the elements preceding them
[starts-with(@href,"http")]    # keeping those whose href attribute starts with "http"
[1]                            # select only the first (i.e. nearest preceding) such element
/@href                         # and return the value of its href attribute
I don't think you can do this one with regex alone (especially looking before the keyword match) as it has no sense of comparing distances.
I think you're best off doing something like this:
find all occurrences of sales and get their substring indices, called salesIndex
find all occurrences of https?://[-A-Za-z0-9./]+ and get their substring indices, called urlIndex
loop through salesIndex. For each location i in salesIndex, find the closest urlIndex.
Depending on how you want to judge "closest", you may need to get the start and end indices of the sales and http... occurrences to compare. That is, find the end index of a URL that is closest to the start index of the current occurrence of sales, find the start index of a URL that is closest to the end index of the current occurrence of sales, and pick the one that is closer.
You can use matches = re.finditer(pattern, string, re.IGNORECASE) to get the matches, and then match.span() to get the start/end substring indices for each match in matches.
Building on what mathematical.coffee suggested, you could try something along these lines:
import re

myString = ""  ## the string you want to search
link_matches = re.finditer('(http|https)://[-A-Za-z0-9./]+', myString, re.IGNORECASE)
sales_matches = re.finditer('sales', myString, re.IGNORECASE)

link_locations = []
for match in link_matches:
    link_locations.append([match.span(), match.group()])

for match in sales_matches:
    match_loc = match.span()
    distances = []
    for link_loc in link_locations:
        if match_loc[0] > link_loc[0][1]:  ## the link ends before the keyword starts
            ## append the distance between the END of the link and the START of the keyword
            distances.append(match_loc[0] - link_loc[0][1])
        else:
            ## append the distance between the END of the keyword and the START of the link
            distances.append(link_loc[0][0] - match_loc[1])
    for d in range(0, len(distances) - 1):
        if distances[d] == min(distances):
            print("Closest Link: " + link_locations[d][1] + "\n")
            break
I tested out this code and it seems to be working...
import re

def closesturl(keyword, website):
    keylist = []
    urllist = []
    closest = []
    urls = []
    urlregex = "(http|https)://[-A-Za-z0-9\\./]+"
    urlmatches = re.finditer(urlregex, website, re.IGNORECASE)
    keymatches = re.finditer(keyword, website, re.IGNORECASE)
    for n in keymatches:
        keylist.append([n.start(), n.end()])
    if len(keylist) > 0:
        for m in urlmatches:
            urllist.append([m.start(), m.end()])
    if (len(keylist) > 0) and (len(urllist) > 0):
        for i in range(0, len(keylist)):
            closest.append(abs(urllist[0][0] - keylist[i][0]))
            urls.append(website[urllist[0][0]:urllist[0][1]])
            if len(urllist) >= 1:
                for j in range(1, len(urllist)):
                    if abs(urllist[j][0] - keylist[i][0]) < closest[i]:
                        closest[i] = abs(keylist[i][0] - urllist[j][0])
                        urls[i] = website[urllist[j][0]:urllist[j][1]]
                    if abs(urllist[j][0] - keylist[i][0]) > closest[i]:
                        break  # local minimum / inflection point, break from the url list
    if (len(keylist) > 0) and (len(urllist) > 0):
        return urls  # return website[urllist[index[0]][0]:urllist[index[0]][1]]
    else:
        return ""

somestring = "hey whats up... http://www.firstlink.com some other test http://www.secondlink.com then mykeyword"
keyword = "mykeyword"
print(closesturl(keyword, somestring))
The above when run shows... http://www.secondlink.com.
If someone's got ideas on how to speed up this code, that would be awesome!
Thanks
V$H.
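One possible way to speed this up, sketched under the assumption that only the nearest URL start index to each keyword matters: sort the URL start positions once (finditer already yields them in document order) and use bisect to find each keyword's neighbours in O(log n) instead of rescanning the whole URL list. The name closest_urls is hypothetical, not from the thread.

import bisect
import re

def closest_urls(keyword, text):
    """For each occurrence of keyword, return the URL whose start index is nearest."""
    url_matches = list(re.finditer(r'https?://[-A-Za-z0-9./]+', text, re.IGNORECASE))
    if not url_matches:
        return []
    starts = [m.start() for m in url_matches]  # already sorted by position
    results = []
    for key in re.finditer(keyword, text, re.IGNORECASE):
        i = bisect.bisect_left(starts, key.start())
        # Candidates: the URL just before and the URL just after the keyword.
        candidates = [j for j in (i - 1, i) if 0 <= j < len(starts)]
        best = min(candidates, key=lambda j: abs(starts[j] - key.start()))
        results.append(url_matches[best].group())
    return results

print(closest_urls("mykeyword", "hey... http://www.firstlink.com test http://www.secondlink.com then mykeyword"))
# ['http://www.secondlink.com']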
