I am trying to extract a specific part from a page using regex but it isn't working.
This is the part I want to extract from the page:
{"clickTrackingParams":"CPcBEJhNIhMIwrDVo4qw3gIVTBnVCh28iAtzKPgd","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":true}},"performCommentActionEndpoint":{"action":"CAUQAhoaVWd4MEdWUGNadTdvclcwT09WdDRBYUFCQWcqC1pNZlAzaERwdjlBMAA4AEoVMTA1MTc3MTgyMDc5MDg5MzQ1ODM4UACKAVQSC1pNZlAzaERwdjlBMixlaHBWWjNnd1IxWlFZMXAxTjI5eVZ6QlBUMVowTkVGaFFVSkJadyUzRCUzRMABAMgBAOABAaICDSj___________8BQAA%3D","clientActions":[{"updateCommentVoteAction":{"voteCount":{"accessibility":{"accessibilityData":{"label":"80 likes"}},"simpleText":"80"},"voteStatus":"LIKE"}}]}}
So far I've tried this:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
mystrx = re.search(r'^{"clickTrackingParams".*"voteStatus":"LIKE"}}]}}', html_source)
but it didn't work out for me.
Try this:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
fst, snd = '{"clickTrackingParams":', '"voteStatus":"LIKE"}}]}}'
# Find first occurrence
end = html_source.find(snd)
# Get closest index
start = max(idx.start() for idx in re.finditer(fst, html_source) if idx.start() < end)
print(html_source[start:end+len(snd)])
Which outputs:
{"clickTrackingParams":"CPcBEJhNIhMIwrDVo4qw3gIVTBnVCh28iAtzKPgd","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":true}},"performCommentActionEndpoint":{"action":"CAUQAhoaVWd4MEdWUGNadTdvclcwT09WdDRBYUFCQWcqC1pNZlAzaERwdjlBMAA4AEoVMTA1MTc3MTgyMDc5MDg5MzQ1ODM4UACKAVQSC1pNZlAzaERwdjlBMixlaHBWWjNnd1IxWlFZMXAxTjI5eVZ6QlBUMVowTkVGaFFVSkJadyUzRCUzRMABAMgBAOABAaICDSj___________8BQAA%3D","clientActions":[{"updateCommentVoteAction":{"voteCount":{"accessibility":{"accessibilityData":{"label":"80 likes"}},"simpleText":"80"},"voteStatus":"LIKE"}}]}}
If you want to get the second occurrence, you can try something along the lines of:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
fst, snd = '{"clickTrackingParams":', '"voteStatus":"LIKE"}}]}}'
def find_nth(string, to_find, n):
    """
    Finds the nth match in the string.
    """
    # find all occurrences
    matches = [idx.start() for idx in re.finditer(to_find, string)]
    # return the nth match
    return matches[n]
# finds second match
end = find_nth(html_source, snd, 1)
# Gets closest index to end
start = max(idx.start() for idx in re.finditer(fst, html_source) if idx.start() < end)
print(html_source[start:end+len(snd)])
Note: In the second example you can run into an IndexError if you request an occurrence outside of the found matches. You will need to handle this behaviour yourself.
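One way to guard against that, as a rough sketch (the find_nth_safe helper below is my own, hypothetical name, not part of the code above), is to catch the error and fall back to None:
def find_nth_safe(string, to_find, n):
    """Like find_nth, but returns None when there is no nth match."""
    matches = [idx.start() for idx in re.finditer(to_find, string)]
    try:
        return matches[n]
    except IndexError:
        return None

end = find_nth_safe(html_source, snd, 1)
if end is None:
    print("requested occurrence not found")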
Related
The project that I am doing requires us to input a URL, follow the link at a particular position a number of times, and then return the last page visited. I have found a solution with a while loop, but now I am trying to do it with recursion.
Example: http://py4e-data.dr-chuck.net/known_by_Fikret.html
Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.
Sequence of names: Fikret Montgomery Mhairade Butchi Anayah
Last name in sequence: Anayah
My code is this so far:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
cnt = input("Enter count:")
post = input("Enter position:")
url = "http://py4e-data.dr-chuck.net/known_by_Fikret.html"
count = int(cnt)
position = int(post)
def GetLinks(initalPage, position, count):
    html = urlopen(initalPage).read()
    soup = BeautifulSoup(html, "html.parser")
    temp = ""
    links = list()
    tags = soup('a')
    for tag in tags:
        x = tag.get('href', None)
        links.append(x)
    print(links[position - 1])
    if count > 1:
        GetLinks(links[position-1], position, count - 1)
    return links[position - 1]

y = GetLinks(url, position, count)
print("****", y)
I see two problems with my code.
First, I am creating a list that expands with every recursion, which makes it very hard to locate the proper value.
Second, I am obviously returning the wrong item.
I don't know exactly how to fix this.
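For reference, here is a minimal sketch (my own rewrite, not the official course solution; the name get_links and the list comprehension are mine) of how the recursion can hand the last link back up by returning the recursive call:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_links(url, position, count):
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    # collect the hrefs for this page only
    links = [tag.get('href', None) for tag in soup('a')]
    next_url = links[position - 1]
    print(next_url)
    if count > 1:
        # return the recursive result so the deepest link reaches the caller
        return get_links(next_url, position, count - 1)
    return next_url

# usage with the example above: position 3, repeated 4 times
# print("****", get_links("http://py4e-data.dr-chuck.net/known_by_Fikret.html", 3, 4))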
If I were to print m, there would be a result that begins with "Histology" and ends with a period. Despite that, the output shows up empty.
from bs4 import BeautifulSoup
from googlesearch import search
import requests
from goose3 import Goose
import re

def search_google(query):
    parent_=[]
    for j in search(query, tld="co.in", num=10, stop=5, pause=2):
        child_=[]
        link_=j
        site_name=link_.split("/")[2]
        child_.append(site_name)
        child_.append(link_)
        parent_.append(child_)
        g = Goose()
        article = g.extract(link_)
        m = article.cleaned_text
        Answer = re.findall(r'\bHistology\s+([^.]*)',m)
    print(Answer)

f = search_google("""'Histology'""")
Output: []
It seems your print(Answer) line has incorrect indentation, and your last result has no matches in the cleaned text. This is why your print results in an empty list.
The print command, since it sits outside of the loop, only triggers once. And given that the final value of Answer has no matches, you get back an empty list.
Indent the print(Answer) line by one level, so it sits inside the loop, and it should output the correct result.
Your regex will also only match the sentence following Histology and not include the word itself. This is due to you specifying a capture group without Histology included. You can resolve this by removing the capturing group.
r'\bHistology\s+[^.]*'
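For example, on a made-up sample string (not the scraped text), the difference looks like this:
import re

sample = "Histology is the study of tissues. Something unrelated."
print(re.findall(r'\bHistology\s+([^.]*)', sample))
# ['is the study of tissues']            <- only the capture group is returned
print(re.findall(r'\bHistology\s+[^.]*', sample))
# ['Histology is the study of tissues']  <- the full match, keyword included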
from bs4 import BeautifulSoup
from googlesearch import search
import requests
from goose3 import Goose
import re

def search_google(query):
    parent_=[]
    for j in search(query, tld="co.in", num=10, stop=5, pause=2):
        child_=[]
        link_=j
        site_name=link_.split("/")[2]
        child_.append(site_name)
        child_.append(link_)
        parent_.append(child_)
        g = Goose()
        article = g.extract(link_)
        m = article.cleaned_text
        Answer = re.findall(r'\bHistology\s+[^.]*',m)
        print(Answer)

f = search_google("""'Histology'""")
To print all results on individual lines, you can change print(Answer) to print('\n'.join(Answer)).
Looking at the site, I would not expect to see an error, because each Yoruba entry has its Meaning and Translation, and there are 220 Yoruba entries.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

res = requests.get('http://yoruba.unl.edu/yoruba.php-text=1a&view=0&uni=0&l=1.htm')
soup = BeautifulSoup(res.content,'html.parser')

edu = {'Yoruba':[],'Translation':[],'Meaning':[]}

# first loop
for br in soup.select('p > br:nth-of-type(1)'):
    text = br.previous_sibling.strip()
    edu['Yoruba'].append(text)

# second loop
for br in soup.select('p > br:nth-of-type(2)'):
    text = br.previous_sibling
    if isinstance(text, str):
        edu['Translation'].append(text.strip())

# third loop
for br in soup.select('p > br:nth-of-type(3)'):
    text = br.previous_sibling
    if isinstance(text, str):
        edu['Meaning'].append(re.sub(r'[\(\)]','',str(text.strip())))

df7 = pd.DataFrame(edu)
Error
ValueError: arrays must all be same length
Since each of the three keys has a different length, I guess the best way to address it is to pad the shorter lists to the length of the longest one (220, in this case). To do that, add the following right before creating your dataframe:
length = max(len(edu['Meaning']), len(edu['Translation']), len(edu['Yoruba']))  # in case you don't know it, this finds the length of the longest list
for k in edu:
    for i in range(length - len(edu[k])):
        edu[k].append("NA")  # this is where the padding happens; you can replace "NA" with anything else, obviously
df7 = pd.DataFrame.from_dict(edu)  # since edu is a dictionary, I would use this method
df7
Let me know if that works.
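As a side note, and purely as an alternative sketch of my own (not required for the fix above): pandas can do the padding for you if you build the frame from Series, since shorter columns are filled with NaN automatically.
import pandas as pd

# each list becomes a Series; pandas aligns them and pads the shorter ones with NaN
df7 = pd.DataFrame({k: pd.Series(v) for k, v in edu.items()})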
I have a little script in Python 3.7 (see related question here) that scrapes links from a website (http://digesto.asamblea.gob.ni/consultas/coleccion/) and saves them in a list. Unfortunately, they are only partial and I have to trim them to use them as links.
This is the relevant part of the script:
list_of_links = []  # will hold the scraped links
tld = 'http://digesto.asamblea.gob.ni'
current_url = driver.current_url  # for any links not starting with /
table_id = driver.find_element(By.ID, 'tableDocCollection')
rows = table_id.find_elements_by_css_selector("tbody tr")  # get all table rows
for row in rows:
    row.find_element_by_css_selector('button').click()
    link = row.find_element_by_css_selector('li a[onclick*=pdf]').get_attribute("onclick")  # href
    print(list_of_links)  # trim
    if link.startswith('/'):
        list_of_links.append(tld + link)
    else:
        list_of_links.append(current_url + link)
    row.find_element_by_css_selector('button').click()
print(list_of_links)
How can I manipulate the list (shown here, as an example, with only three entries) so that this
["http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D');return false;", "http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=Z%2FgLeZxynkg%3D');return false;", "http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=9rka%2BmYwvYM%3D');return false;"]
looks like
["http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D", "http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=Z%2FgLeZxynkg%3D", "http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=9rka%2BmYwvYM%3D"]
Breaking it down with the first link as an example: I get this link from the website basically as
http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D');return false;
and need to trim it to
http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D.
How do I achieve this in python from the entire list?
One approach is to split on the string /consultas/coleccion/window.open(', remove the unwanted end of the second string and concatenate the two processed strings to get your result.
This should do it:
new_links = []
for link in list_of_links:
    current_strings = link.split("/consultas/coleccion/window.open('")
    current_strings[1] = current_strings[1].split("');return")[0]
    new_link = current_strings[0] + current_strings[1]
    new_links.append(new_link)
This should do the trick:
s = "http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D');return false;"
s = s.replace("/consultas/coleccion/window.open('", "").replace("');return false;", "")
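Applied to the whole list from the question, the same pair of replacements might look like this (cleaned_links is just an illustrative name):
cleaned_links = [
    link.replace("/consultas/coleccion/window.open('", "").replace("');return false;", "")
    for link in list_of_links
]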
You could use a regular expression to split the URLs in your list and let urllib.parse.urljoin() do the rest for you:
import re
from urllib.parse import urljoin

PATTERN = r"^([\S]+)window.open\('([\S]+)'"
links = ["http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D');return false;"]
result = "http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D"

for link in links:
    m = re.match(PATTERN, link, re.MULTILINE).groups()
    # m is now: ('http://digesto.asamblea.gob.ni/consultas/coleccion/', '/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D')
    if len(m) == 2:
        newLink = urljoin(*m)
        print(newLink)
        assert newLink == result
Returns:
http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D
For that you can use a regular expression:
Consider this code:
import re

out = list()
lst = ["http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=p2%2FHzlqau8A%3D');return false;", "http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=Z%2FgLeZxynkg%3D');return false;", "http://digesto.asamblea.gob.ni/consultas/coleccion/window.open('/consultas/util/pdf.php?type=rdd&rdd=9rka%2BmYwvYM%3D');return false;"]
for el in lst:
    temp = re.sub(r"(.*?)/consultas/coleccion/window.open\('(.*?)'\).*", r"\1\2", el)
    out.append(temp)
    print(temp)
The sub function replaces the part of the string that matches the specified pattern. Basically the pattern says:
(.*?): keeps all the characters before /consultas/coleccion/window.open('
/consultas/coleccion/window.open\(' : the input string must contain this pattern, but it will not be kept
(.*?): keeps all the characters after the previous pattern until ') is found (represented by '\))
I'm looking for the keyword "sales" and I want to get the nearest "http://www.somewebsite.com", even if there are multiple links in the file. I want the nearest link, not the first link. This means I need to search for the link that comes just before the keyword match.
This doesn't work...
regex = (http|https)://[-A-Za-z0-9./]+.*(?!((http|https)://[-A-Za-z0-9./]+))sales
What's the best way to find the link that is closest to a keyword?
It is generally much easier and more robust to use an HTML parser rather than regex.
Using the third-party module lxml:
import lxml.html as LH

content = '''<html>
<a href="http://www.somewebsite.com">other stuff</a><p>sales</p>
</html>
'''

doc = LH.fromstring(content)
for url in doc.xpath('''
    //*[contains(text(),"sales")]
    /preceding::*[starts-with(@href,"http")][1]/@href'''):
    print(url)
yields
http://www.somewebsite.com
I find lxml (and XPath) a convenient way to express what elements I'm looking for. However, if installing a third-party module is not an option, you could also accomplish this particular job with HTMLParser from the standard library:
import html.parser as HTMLParser
import contextlib

class MyParser(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.last_link = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if 'href' in attrs:
            self.last_link = attrs['href']

content = '''<html>
<a href="http://www.somewebsite.com">other stuff</a><p>sales</p>
</html>
'''

idx = content.find('sales')
with contextlib.closing(MyParser()) as parser:
    parser.feed(content[:idx])
print(parser.last_link)
Regarding the XPath used in the lxml solution: The XPath has the following meaning:
//*                          # Find all elements
[contains(text(),"sales")]   # whose text content contains "sales"
/preceding::*                # search the preceding elements
[starts-with(@href,"http")]  # such that it has an href attribute that starts with "http"
[1]                          # select only the first (nearest) such element
/@href                       # return the value of the href attribute
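If you prefer to restrict the search to <a> elements explicitly, a small variant of the same XPath (my own adjustment, not part of the original answer) also works:
for url in doc.xpath('''
    //*[contains(text(),"sales")]
    /preceding::a[starts-with(@href,"http")][1]/@href'''):
    print(url)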
I don't think you can do this one with regex alone (especially looking before the keyword match) as it has no sense of comparing distances.
I think you're best off doing something like this:
find all occurrences of sales and get the substring index, called salesIndex
find all occurrences of https?://[-A-Za-z0-9./]+ and get the substring index, called urlIndex
loop through salesIndex. For each location i in salesIndex, find the closest urlIndex.
Depending on how you want to judge "closest", you may need to get the start and end indices of the sales and http... occurrences to compare. I.e., find the end index of a URL that is closest to the start index of the current occurrence of sales, and find the start index of a URL that is closest to the end index of the current occurrence of sales, and pick the one that is closer.
You can use matches = re.finditer(pattern, string, re.IGNORECASE) to get an iterator over the matches, and then match.span() to get the start/end substring indices for each match in matches.
Building on what mathematical.coffee suggested, you could try something along these lines:
import re

myString = ""  ## the string you want to search
link_matches = re.finditer('(http|https)://[-A-Za-z0-9./]+', myString, re.IGNORECASE)
sales_matches = re.finditer('sales', myString, re.IGNORECASE)

link_locations = []
for match in link_matches:
    link_locations.append([match.span(), match.group()])

for match in sales_matches:
    match_loc = match.span()
    distances = []
    for link_loc in link_locations:
        if match_loc[0] > link_loc[0][1]:  ## if the link is behind your keyword
            ## append the distance between the END of the link and the START of the keyword
            distances.append(match_loc[0] - link_loc[0][1])
        else:
            ## append the distance between the END of the keyword and the START of the link
            distances.append(link_loc[0][0] - match_loc[1])
    for d in range(0, len(distances)):
        if distances[d] == min(distances):
            print("Closest Link: " + link_locations[d][1] + "\n")
            break
I tested out this code and it seems to be working...
import re

def closesturl(keyword, website):
    keylist = []
    urllist = []
    closest = []
    urls = []
    urlregex = "(http|https)://[-A-Za-z0-9\\./]+"
    urlmatches = re.finditer(urlregex, website, re.IGNORECASE)
    keymatches = re.finditer(keyword, website, re.IGNORECASE)
    for n in keymatches:
        keylist.append([n.start(), n.end()])
    if (len(keylist) > 0):
        for m in urlmatches:
            urllist.append([m.start(), m.end()])
    if ((len(keylist) > 0) and (len(urllist) > 0)):
        for i in range(0, len(keylist)):
            closest.append(abs(urllist[0][0] - keylist[i][0]))  # store distances as plain ints
            urls.append(website[urllist[0][0]:urllist[0][1]])
            if (len(urllist) >= 1):
                for j in range(1, len(urllist)):
                    if (abs(urllist[j][0] - keylist[i][0]) < closest[i]):
                        closest[i] = abs(keylist[i][0] - urllist[j][0])
                        urls[i] = website[urllist[j][0]:urllist[j][1]]
                    if (abs(urllist[j][0] - keylist[i][0]) > closest[i]):
                        break  # local minimum / inflection point, break from url list
    if ((len(keylist) > 0) and (len(urllist) > 0)):
        return urls  # return website[urllist[index[0]][0]:urllist[index[0]][1]]
    else:
        return ""

somestring = "hey whats up... http://www.firstlink.com some other test http://www.secondlink.com then mykeyword"
keyword = "mykeyword"
print(closesturl(keyword, somestring))
The above when run shows... http://www.secondlink.com.
If someone's got ideas on how to speed up this code, that would be awesome!
Thanks
V$H.