Regex: Sentence Beginning With Keyword and Ending With a Period Gives Blank Output - python

If I print m, there is text that begins with "Histology" and ends with a period. Despite that, the output shows up empty.
from bs4 import BeautifulSoup
from googlesearch import search
import requests
import re
from goose3 import Goose

def search_google(query):
    parent_=[]
    for j in search(query, tld="co.in", num=10, stop=5, pause=2):
        child_=[]
        link_=j
        site_name=link_.split("/")[2]
        child_.append(site_name)
        child_.append(link_)
        parent_.append(child_)
        g = Goose()
        article = g.extract(link_)
        m = article.cleaned_text
    Answer = re.findall(r'\bHistology\s+([^.]*)',m)
    print(Answer)

f = search_google("""'Histology'""")
Output: []

It seems your Answer variable has incorrect indentation, and your last result has no matches in the cleaned text. This is why your print results in an empty list.
Since the print call sits outside of the loop, it only triggers once, and given that the final value of Answer has no matches, you are returned an empty list.
Indent the Answer assignment (and the print) by one level so they run inside the loop, and it should output the correct result.
Your regex will also only match the sentence following Histology and not include the word itself, because you specified a capture group that excludes Histology. You can resolve this by removing the capturing group:
r'\bHistology\s+[^.]*'
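As a quick illustration with a made-up sample string, here is a minimal sketch of how the capture group changes what re.findall returns:

import re

text = "Histology is the study of tissues. Other text follows."
# With a capture group, findall returns only the group's contents:
print(re.findall(r'\bHistology\s+([^.]*)', text))
# ['is the study of tissues']
# Without the group, findall returns the whole match, keyword included:
print(re.findall(r'\bHistology\s+[^.]*', text))
# ['Histology is the study of tissues']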
With both fixes applied (plus the import re the snippet needs), the full code looks like this:

from bs4 import BeautifulSoup
from googlesearch import search
import requests
import re
from goose3 import Goose

def search_google(query):
    parent_=[]
    for j in search(query, tld="co.in", num=10, stop=5, pause=2):
        child_=[]
        link_=j
        site_name=link_.split("/")[2]
        child_.append(site_name)
        child_.append(link_)
        parent_.append(child_)
        g = Goose()
        article = g.extract(link_)
        m = article.cleaned_text
        Answer = re.findall(r'\bHistology\s+[^.]*',m)
        print(Answer)

f = search_google("""'Histology'""")
To print all results on individual lines, you can change print(Answer) to print('\n'.join(Answer)).

Related

Python - Returning String with recursion

The project that I am doing requires us to input a URL, follow the link at a particular position a number of times, and then return the last page visited. I have found a solution with a while loop, but now I am trying to do it with recursion.
Example: http://py4e-data.dr-chuck.net/known_by_Fikret.html
Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.
Sequence of names: Fikret Montgomery Mhairade Butchi Anayah
Last name in sequence: Anayah
My code is this so far:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re

cnt = input("Enter count:")
post = input("Enter position:")
url = "http://py4e-data.dr-chuck.net/known_by_Fikret.html"
count = int(cnt)
position = int(post)

def GetLinks(initalPage, position, count):
    html = urlopen(initalPage).read()
    soup = BeautifulSoup(html, "html.parser")
    temp = ""
    links = list()
    tags = soup('a')
    for tag in tags:
        x = tag.get('href', None)
        links.append(x)
    print(links[position - 1])
    if count > 1:
        GetLinks(links[position-1], position, count - 1)
    return links[position - 1]

y = GetLinks(url, position, count)
print("****", y)
I see two problems with my code.
First, I am creating a list that expands with every recursion, which makes it very hard to locate the proper value.
Second, I am obviously returning the wrong item.
I don't know exactly how to fix this.
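For what it's worth, here is a minimal sketch of one possible fix (keeping the original names): the result of the recursive call is currently discarded, so every call returns its own links[position - 1]. Returning the recursive result propagates the last page back up to the original caller.

from urllib.request import urlopen
from bs4 import BeautifulSoup

def GetLinks(initalPage, position, count):
    html = urlopen(initalPage).read()
    soup = BeautifulSoup(html, "html.parser")
    # links is local to each call, so it does not grow across recursions
    links = [tag.get('href', None) for tag in soup('a')]
    nth_link = links[position - 1]
    print(nth_link)
    if count > 1:
        # return the recursive result so the deepest link is what comes back
        return GetLinks(nth_link, position, count - 1)
    return nth_link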

getting specific part from a page source python

I am trying to extract a specific part from a page using regex but it isn't working.
This is the part I want to be extracted from the page:
{"clickTrackingParams":"CPcBEJhNIhMIwrDVo4qw3gIVTBnVCh28iAtzKPgd","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":true}},"performCommentActionEndpoint":{"action":"CAUQAhoaVWd4MEdWUGNadTdvclcwT09WdDRBYUFCQWcqC1pNZlAzaERwdjlBMAA4AEoVMTA1MTc3MTgyMDc5MDg5MzQ1ODM4UACKAVQSC1pNZlAzaERwdjlBMixlaHBWWjNnd1IxWlFZMXAxTjI5eVZ6QlBUMVowTkVGaFFVSkJadyUzRCUzRMABAMgBAOABAaICDSj___________8BQAA%3D","clientActions":[{"updateCommentVoteAction":{"voteCount":{"accessibility":{"accessibilityData":{"label":"80 likes"}},"simpleText":"80"},"voteStatus":"LIKE"}}]}}
So far I've tried this:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
mystrx = re.search(r'^{"clickTrackingParams".*"voteStatus":"LIKE"}}]}}', html_source)
but it didn't work out for me.
Try this (your re.search likely finds nothing because the ^ anchor only matches at the very start of the text):
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
fst, snd = '{"clickTrackingParams":', '"voteStatus":"LIKE"}}]}}'
# Find the first occurrence of the end marker
end = html_source.find(snd)
# Get the closest start-marker index before that end
start = max(idx.start() for idx in re.finditer(fst, html_source) if idx.start() < end)
print(html_source[start:end+len(snd)])
Which Outputs:
{"clickTrackingParams":"CPcBEJhNIhMIwrDVo4qw3gIVTBnVCh28iAtzKPgd","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":true}},"performCommentActionEndpoint":{"action":"CAUQAhoaVWd4MEdWUGNadTdvclcwT09WdDRBYUFCQWcqC1pNZlAzaERwdjlBMAA4AEoVMTA1MTc3MTgyMDc5MDg5MzQ1ODM4UACKAVQSC1pNZlAzaERwdjlBMixlaHBWWjNnd1IxWlFZMXAxTjI5eVZ6QlBUMVowTkVGaFFVSkJadyUzRCUzRMABAMgBAOABAaICDSj___________8BQAA%3D","clientActions":[{"updateCommentVoteAction":{"voteCount":{"accessibility":{"accessibilityData":{"label":"80 likes"}},"simpleText":"80"},"voteStatus":"LIKE"}}]}}
If you want to get the second occurrence, you can try something along the lines of:
import requests
import re
r = requests.get('http://rophoto.es/ash.txt')
html_source = r.text
fst, snd = '{"clickTrackingParams":', '"voteStatus":"LIKE"}}]}}'
def find_nth(string, to_find, n):
    """
    Finds the nth match in string
    """
    # find all occurrences
    matches = [idx.start() for idx in re.finditer(to_find, string)]
    # return the nth match
    return matches[n]

# finds the second match (n is zero-based)
end = find_nth(html_source, snd, 1)
# Gets the closest start-marker index before end
start = max(idx.start() for idx in re.finditer(fst, html_source) if idx.start() < end)
print(html_source[start:end+len(snd)])
Note: in the second example you can run into an IndexError if you request an occurrence outside of the found matches. You will need to handle this behaviour yourself.
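For instance, one possible guard (an illustrative sketch, not part of the original answer) returns None instead of raising:

def find_nth_safe(string, to_find, n):
    """Return the start index of the nth match, or None if there are fewer than n+1 matches."""
    matches = [idx.start() for idx in re.finditer(to_find, string)]
    return matches[n] if n < len(matches) else None

end = find_nth_safe(html_source, snd, 1)
if end is None:
    print("fewer than 2 matches found")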

How can I group it by using "search" function in regular expression?

I have been developing a python web-crawler to collect the used car stock data from this website. (http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page=20)
First of all, I would like to collect only the "BMW" entries from the list. So, I used the "search" function from the regular expression module as in the code below. But it keeps returning "None".
Is there anything wrong in my code?
Please give me some advice.
Thanks.
from bs4 import BeautifulSoup
import urllib.request
import re

CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="

def fetch_post_list():
    for i in range(20,21):
        URL = CAR_PAGE_TEMPLATE + str(i)
        res = urllib.request.urlopen(URL)
        html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='cyber')
        print ("Page#", i)
        # 50 lists per each page
        lists=table.find_all('tr', itemtype="http://schema.org/Article")
        count=0
        r=re.compile("[BMW]")
        for lst in lists:
            if lst.find_all('td')[3].find('em').text:
                lst_price=lst.find_all('td')[3].find('em').text
                lst_title=lst.find_all('td')[1].find('a').text
                lst_link = lst.find_all('td')[1].find('a')['href']
                lst_photo_url=''
                if lst.find_all('td')[0].find('img'):
                    lst_photo_url = lst.find_all('td')[0].find('img')['src']
                count+=1
            else: continue
            print('#',count, lst_title, r.search("lst_title"))
    return lst_link

fetch_post_list()
r.search("lst_title")
This is searching inside the string literal "lst_title", not the variable named lst_title, which is why it never matches.
r=re.compile("[BMW]")
The square brackets define a character class, meaning you're looking for any one of those characters. So, for example, any string containing M will match. You just want "BMW". In fact you don't even need regular expressions; you can just test:
"BMW" in lst_title

For Loop doesn't spit out needed results

I got this piece of code to spit out the unique "area number" in each URL. However, the loop doesn't work: it spits out the same number every time, please see below:
import urllib3
from bs4 import BeautifulSoup

http = urllib3.PoolManager()
url = open('MS Type 1 URL.txt',encoding='utf-8-sig')
links = []
for link in url:
    y = link.strip()
    links.append(y)
url.close()

print('Amount of Links: ', len(links))

for x in links:
    j = (x.find("=") + 1)
    g = (x.find('&housing'))
    print(link[j:g])
Results are:
http://millersamuel.com/aggy-data/home/query_report?area=38&housing_type=3&measure=4&query_type=quarterly&region=1&year_end=2020&year_start=1980
23
http://millersamuel.com/aggy-data/home/query_report?area=23&housing_type=1&measure=4&query_type=annual&region=1&year_end=2020&year_start=1980
23
As you can see, it spits out the area number '23', which appears in only one of the URLs, instead of the '38' from the other URL.
There's a typo in your code. You iterate over the links list and bind its elements to the variable x, but print a slice of the variable link, so you get the same string printed on each loop iteration. You can simply change print(link[j:g]) to print(x[j:g]), but it's better to give your variables more descriptive names, so here's a fixed version of your loop:
for link in links:
    j = link.find('=') + 1
    g = link.find('&housing')
    print(link[j:g])
And I also want to show you a proper way to extract the area value from URLs:
from urllib.parse import urlparse, parse_qs
url = 'http://millersamuel.com/aggy-data/home/query_report?area=38&housing_type=3&measure=4&query_type=quarterly&region=1&year_end=2020&year_start=1980'
area = parse_qs(urlparse(url).query)['area'][0]
So instead of using the str.find method, you can write this:
for url in urls:
    parsed_qs = parse_qs(urlparse(url).query)
    if 'area' in parsed_qs:
        area = parsed_qs['area'][0]
        print(area)
Used functions:
urllib.parse.urlparse
urllib.parse.parse_qs
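For reference, a small sketch of what parse_qs returns (each value is a list, which is why the example indexes with [0]):

from urllib.parse import urlparse, parse_qs

url = 'http://millersamuel.com/aggy-data/home/query_report?area=38&housing_type=3'
print(parse_qs(urlparse(url).query))
# {'area': ['38'], 'housing_type': ['3']}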
You need to change:
print(link[j:g]) to print(x[j:g])

Different outputs in removing characters from a list - Python

I'm doing some web scraping in Python and I want to delete the "." character from each element of a list. I have two approaches, but only one gives the correct output. The code is below.
import urllib2
from bs4 import BeautifulSoup

first=urllib2.urlopen("http://www.admision.unmsm.edu.pe/res20130914/A.html").read()
soup=BeautifulSoup(first)
w=[]
for q in soup.find_all('tr'):
    for link in q.find_all('a'):
        w.append(link["href"])

s = [ i.replace(".","") for i in w ]

l=[]
for t in w:
    l=t.replace(".","")
If I run print s, the output is right, but if I run print l, it isn't.
I would like to know why s gives the correct output and l doesn't.
In the loop, you rebind l to a single string on each iteration, instead of appending to the list as the list comprehension does.
Instead, try:
for t in w:
    l.append(t.replace(".",""))
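A tiny illustration of the difference, using a made-up list:

w = ['a.b', 'c.d']

l = []
for t in w:
    l = t.replace(".", "")    # rebinds l to a plain string each time
print l                       # cd - only the last element survives

l = []
for t in w:
    l.append(t.replace(".", ""))
print l                       # ['ab', 'cd']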
You are reassigning l each time, so its previous value gets overwritten. As a result, you are left with just the last element after the iterations! Hope it helps!
import urllib2
from bs4 import BeautifulSoup

first=urllib2.urlopen("http://www.admision.unmsm.edu.pe/res20130914/A.html").read()
soup=BeautifulSoup(first)
w=[]
for q in soup.find_all('tr'):
    for link in q.find_all('a'):
        w.append(link["href"])

s = [ i.replace(".","") for i in w ]
print s

l=[]
for t in w:
    l.append(t.replace(".",""))
print l
Cheers!
