Pagination: why does the loop keep running when the page URL doesn't match? - Python

I want to scrape data from a website, but first I need to walk through its pagination. I'm using Python and already have the code below, but it doesn't work properly: the loop should stop when response.url no longer matches expected_url, yet it keeps running. Does anyone know how to solve this? Please help, thank you.
Here is the code:
from bs4 import BeautifulSoup
import urllib.request

count = 0
url = "http://www.belanjamimo.net/foundation-bb-cream/?o=a&s=%d"

def get_url(url):
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)

expected_url = url % count
response = get_url(expected_url)
while (response.url == expected_url):
    print("GET {0}".format(expected_url))
    count += 9
    expected_url = url % count
    response = get_url(expected_url)
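One likely reason the URL check never fails (a guess, not verified against the site): the server may answer an out-of-range s= offset with a normal 200 page at the same URL instead of redirecting, in which case response.url always equals expected_url. A minimal diagnostic sketch, assuming the offset 9999 is past the last product:

import urllib.request

url = "http://www.belanjamimo.net/foundation-bb-cream/?o=a&s=%d"
probe = url % 9999  # arbitrary offset, assumed to be past the last product
with urllib.request.urlopen(probe) as response:
    print("requested:", probe)
    print("final url:", response.url)  # if these match, the while condition can never become False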

Try the approach below to exhaust all the items across the pages and break out of the loop when there are no more items available.
from bs4 import BeautifulSoup
import requests

url = "http://www.belanjamimo.net/foundation-bb-cream/?o=a&s={}"

page = 0
while True:
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    items = soup.select(".product-block h2 a")
    if len(items) <= 1:  # check whether there are any products still available
        break
    for item in items:
        print(item.text)
    page += 9


Python web scraping script does not find element by css selector

I'm trying to get this web scraper to fetch the current electricity price from this website; the site is in Finnish, but the price is right under "Hinta nyt". https://sahko.tk/
Here's my code:
import requests
from bs4 import BeautifulSoup

url = "https://sahko.tk/"
element_selector = ""

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

elements = soup.find_all(element_selector)
if len(elements) == 0:
    print("No element found with selector '%s'" % element_selector)
else:
    element_text = elements[0].text
    print(element_text)
I left element_selector empty because whatever I tried just did not work. I'm not even sure I'm on the right track.
The data you see is embedded inside a <script> tag on that page. To parse the current price you can use the following example:
import re
import json
import requests
url = "https://sahko.tk/"
data = requests.get(url).text
data = re.search(r"function prices_today\(\)\{var t= (.*?});", data).group(1)
data = json.loads(data)
print("Hinta nyt", data["now"], "snt/kWh")
Prints:
Hinta nyt 33.27 snt/kWh

How do I know how many pages a thread has? (web scraping)

I have a website (https://www.diabetesdaily.com/forum/threads/had-a-friend-with-type-one.136015/) and I found it has many forum pages.
I want to use a for loop for the scraping, so how do I get the maximum number of forum pages for this thread with BeautifulSoup?
Many thanks.
You can try something like this:
import requests
from bs4 import BeautifulSoup as bs
url = "https://www.diabetesdaily.com/forum/threads/had-a-friend-with-type-one.136015/"
req = requests.get(url)
soup = bs(req.content, 'html.parser')
navs = soup.find("ul", { "class" : "pageNav-main" }).find_all("li", recursive=False)
print(navs)
print(f'Length: {len(navs)}')
Result
[<li class="pageNav-page pageNav-page--current">1</li>, <li class="pageNav-page pageNav-page--later">2</li>, <li class="pageNav-page pageNav-page--later">3</li>, <li class="pageNav-page">4</li>]
Length: 4
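A caveat, based only on the output above: counting the <li> items works here because this thread has just four pages; on longer threads the page nav is often truncated with an ellipsis item, so the count of list items can understate the real total. A hedged variant that instead reads the number shown in the last nav item (assuming the last <li> always holds the highest page number, as it does in the output above):

import requests
from bs4 import BeautifulSoup as bs

url = "https://www.diabetesdaily.com/forum/threads/had-a-friend-with-type-one.136015/"
soup = bs(requests.get(url).content, 'html.parser')

navs = soup.find("ul", {"class": "pageNav-main"}).find_all("li", recursive=False)
last_page = int(navs[-1].get_text(strip=True))  # assumes the last <li> shows the highest page number
print(last_page)  # 4 for this thread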
You don't need BeautifulSoup to count the number of pages.
URL of page 1 : https://www.diabetesdaily.com/forum/threads/had-a-friend-with-type-one.136015/page-1
URL of page 2 : https://www.diabetesdaily.com/forum/threads/had-a-friend-with-type-one.136015/page-2
URL of page 3 : https://www.diabetesdaily.com/forum/threads/had-a-friend-with-type-one.136015/page-3
And so on...
So you need to increment the value X in https://www.diabetesdaily.com/forum/threads/had-a-friend-with-type-one.136015/page-X to move to the next page. You can then check the response's status code and the page title to make sure you are not visiting the same page twice, since the forum may simply serve the last page again (with the same title) for an out-of-range page number instead of returning an error.
import requests
import re

def getPageTitle(response_text):
    d = re.search(r'<\W*title\W*(.*)</title', response_text, re.IGNORECASE)
    return d.group(1)

def count_pages():
    count = 0
    uniquePages = set()
    while True:
        count += 1
        url = ('https://www.diabetesdaily.com/forum/threads/' +
               f'had-a-friend-with-type-one.136015/page-{count}')
        response = requests.get(url)
        title = getPageTitle(response.text)
        if title in uniquePages or response.status_code != 200:
            break
        uniquePages.add(title)
    return len(uniquePages)

print(count_pages())  # 4

Scraping links from Wikipedia

So I am trying to scrape links from a random Wikipedia page. Here is my code thus far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib2

# function get random page
def get_random():
    import requests
    # r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return r.url

#========================
# finding the valid link
def validlink(href):
    if href:
        if re.compile(r'^/wiki/').search(href):
            if not re.compile(r'/\w+:').search(href):
                return True
    return False
#validlink()===========

# the first site
a1 = get_random()
#print("the first site is: " + a1)
# the first site end()====

# looking for the article name:
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', {'class': 'firstHeading'})
print("starting website: " + a1 + " Titled: " + title.text)
print("")
#=============================
# first article done

# find body:
import re
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')
for link in soup.findAll("a"):
    url = link.get("href", "")
    print(url)  # this is the part I know is wrong
#======================
I know I'm doing this last part wrong. I'm new to Python, so I just have no idea how to go about it. What I need is to pull all of the links from the page that the random page takes me to, along with that page's title, and then pull the Wikipedia links off that page, which is what I am trying to do in the last bit of code above.
At that point I want to print all of the links it finds, after they have been tested against my validlink function at the top.
Again, forgive me for being new and not understanding this, but please help, I cannot figure it out.
So my question is: I need a snippet of code that pulls all of the links off the Wikipedia page (note that I still don't know how to do this; the for loop was my best guess based on my own research), tests the links I pulled against my validlink function, and prints out all of the valid links.
If you want the result as a list, create a new list and append() the url if it is valid.
Because the same url can appear many times on a page, I also check whether the url is already on the list.
valid_urls = []

for link in soup.find_all('a'):  # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and validlink(url):
        valid_urls.append(url)

print(valid_urls)
from bs4 import BeautifulSoup
import requests
import re

# --- functions ---

def is_valid(url):
    """finding the valid link"""
    if url:
        if url.startswith('/wiki/'):  # you don't need `re` to check it
            if not re.compile(r'/\w+:').search(url):
                return True
    return False

# --- main ---

#random_url = 'https://en.wikipedia.org/wiki/Special:Random'
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'

r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('h1', {'class': 'firstHeading'})

print('starting website:', r.url)
print('titled:', title.text)
print()

valid_urls = []

for link in soup.find_all('a'):  # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and is_valid(url):
        valid_urls.append(url)

#print(valid_urls)

#for url in valid_urls:
#    print(url)

print('\n'.join(valid_urls))

How to handle an empty list - multiple page web scraping

I am trying to pull the question-and-answer section from Lazada through web scraping, but I am having an issue when some of the pages don't have any questions/answers. My code returns nothing when I run it over multiple web pages, and it only works for a single page that has questions and answers.
How do I make the code continue reading the rest of the web pages even though the first page has no questions?
I have tried adding an if/else statement in my code as shown below.
import bleach
import csv
import datetime
import requests
from bs4 import BeautifulSoup

urls = ['url1', 'url2', 'url3']

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    now = datetime.datetime.now()
    print("Date data being pulled:")
    print(str(now))
    print("")
    nameList = soup.findAll("div", {"class": "qna-content"})
    for name in nameList:
        if nameList == None:
            print('None')
        else:
            print(name.get_text())
            continue
My expected output would be something like what is shown below:
None --> output from url1
None --> output from url2
can choose huzelnut?
Hi Dear Customer , for the latest expiry date its on 2019 , and we will make sure the expiry date is still more than 6 months.--> output from url3
I appreciate your help, thanks in advance!
You have the check in the wrong place: the empty-list test has to go outside the loop, and you also need to fix the indentation. Note that findAll returns an empty list (not None) when nothing matches, so check for an empty list.
urls = ['url1', 'url2', 'url3']

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    now = datetime.datetime.now()
    print("Date data being pulled:")
    print(str(now))
    print("")
    nameList = soup.findAll("div", {"class": "qna-content"})
    if not nameList:  # findAll returns an empty list when there are no Q&A blocks
        print(url, 'None')
        continue  # skip this URL
    for name in nameList:
        print(name.get_text())
I made some changes to the logic of the code and managed to print the records for now. Since I am still learning, I hope others will also share alternative/better solutions.
import datetime
from bs4 import BeautifulSoup
import requests

urls = ['url1', 'url2', 'url3']

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    now = datetime.datetime.now()
    print("Date data being pulled:")
    print(str(now))
    print("")
    qna = []
    qna = soup.findAll("div", class_="qna-content")
    for qnaqna in qna:
        if not qnaqna:
            print('List is empty')
        else:
            print(qnaqna.get_text())
            continue
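A small note on the revised version above (an observation, not a rewrite of the original answer): the `if not qnaqna:` check sits inside the loop, so it only runs when `qna` already has items and can never report an empty page. Checking the list itself before looping matches the intended "List is empty" behaviour. A minimal sketch of that check, reusing the same variables:

qna = soup.findAll("div", class_="qna-content")
if not qna:  # findAll returned an empty list: this page has no Q&A section
    print('List is empty')
else:
    for qnaqna in qna:
        print(qnaqna.get_text())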

Web Scraping with Python (city) as parameter

def findWeather(city):
    import urllib
    connection = urllib.urlopen("http://www.canoe.ca/Weather/World.html")
    rate = connection.read()
    connection.close()
    currentLoc = rate.find(city)
    curr = rate.find("currentDegree")
    temploc = rate.find("</span>", curr)
    tempstart = rate.rfind(">", 0, temploc)
    print "current temp:", rate[tempstart+1:temploc]
The link is provided above. The issue I have is that every time I run the program with, say, "Brussels" in Belgium as the parameter, i.e. findWeather("Brussels"), it always prints 24c as the temperature, whereas (as I am writing this) it should be 19c. This is the case for many other cities listed on the site. Help with this code would be appreciated.
Thanks!
This one should work:
import requests
from bs4 import BeautifulSoup

url = 'http://www.canoe.ca/Weather/World.html'
response = requests.get(url)

# Get the text of the contents
html_content = response.text

# Convert the html content into a beautiful soup object
soup = BeautifulSoup(html_content, 'lxml')

cities = soup.find_all("span", class_="titleText")
cels = soup.find_all("span", class_="currentDegree")

for x, y in zip(cities, cels):
    print(x.text, y.text)
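A follow-up on why the original function always printed the same value: it computes currentLoc = rate.find(city) but never uses it, so the slice it prints always comes from the first currentDegree span on the page, regardless of the city. A hedged sketch that starts searching at the city's position instead (it assumes each city's name is followed by its currentDegree span in the HTML, which matches how the page appears to be laid out):

import requests

def find_weather(city):
    html = requests.get("http://www.canoe.ca/Weather/World.html").text
    city_pos = html.find(city)
    if city_pos == -1:
        print("city not found:", city)
        return
    # look for the first temperature span *after* the city name
    curr = html.find("currentDegree", city_pos)
    temploc = html.find("</span>", curr)
    tempstart = html.rfind(">", 0, temploc)
    print("current temp:", html[tempstart + 1:temploc])

find_weather("Brussels")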
