BeautifulSoup scrapes only a few positions - python

I am learning scraping.
A few months ago I scraped a website with house prices for learning purposes. It is on GitHub:
https://github.com/MariuszTP/Beautifulsoup
Now when I try to scrape it, it returns only a few positions.
I simplified the code:
url = "https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/poznan?distanceRadius=0&market=ALL&locations=%5Bcities_6-1%5D&viewType=listing&lang=pl&searchingCriteria=sprzedaz&searchingCriteria=mieszkanie&searchingCriteria=cala-polska"
html_text = requests.get( url ).text
soup = BeautifulSoup(html_text, 'lxml')
houses = soup.find_all('article', class_ = 'css-n8rq67 es62z2j16')
for i in houses:
print(i.text)
Can somebody please tell me what the problem is and how to solve it?

Today most websites do not have the data in the HTML directly; it is added later by JavaScript, which means plain BeautifulSoup cannot handle dynamic websites. Selenium is one solution:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
# options.add_argument('--disable-blink-features=AutomationControlled')

service = ChromeService(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

URL = 'https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/poznan?distanceRadius=0&market=ALL&locations=%5Bcities_6-1%5D&viewType=listing&lang=pl&searchingCriteria=sprzedaz&searchingCriteria=mieszkanie&searchingCriteria=cala-polska'
driver.get(URL)

houses = driver.find_elements(By.XPATH, "//article[@class='css-n8rq67 es62z2j16']")
for house in houses:
    print(house.text)
print(len(houses))

driver.quit()
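The obfuscated class names (css-n8rq67 es62z2j16) change whenever the site rebuilds its stylesheets, so a more robust variant is to wait for the listing <article> elements by tag name instead of reading the page immediately after driver.get(). This is a minimal sketch, assuming the listings are still rendered as <article> elements:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = 'https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/poznan?distanceRadius=0&market=ALL&locations=%5Bcities_6-1%5D&viewType=listing&lang=pl&searchingCriteria=sprzedaz&searchingCriteria=mieszkanie&searchingCriteria=cala-polska'

driver = webdriver.Chrome()  # Selenium 4.6+ can fetch a matching driver by itself
driver.get(URL)

# Wait up to 10 seconds until at least one <article> element is present in the DOM.
wait = WebDriverWait(driver, 10)
houses = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'article')))

for house in houses:
    print(house.text)
print(len(houses))

driver.quit()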


Parsing a web page with a search bar

I need to parse store names (<div class="LocationName">) from https://www.comicshoplocator.com/StoreLocator.
The thing is, when you enter a zip code (for instance 73533) in the search box, it does not appear in the URL.
Because of that, Python can't see the elements on the results page.
Here is my code snippet. I am receiving no output because of that.
How do I make Python see the results for the zip code I typed in?
Thanks
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
browser = webdriver.Firefox(executable_path=r'C:\Geckodriver\Geckodriver.exe')
browser.get('https://www.comicshoplocator.com/StoreLocator')
browser.find_element(By.NAME, 'query').send_keys('73533' + Keys.RETURN)
html = browser.page_source
soup = BeautifulSoup(html, features="html.parser")
for tag in soup.find_all('div', class_="LocationName"):
    print(tag.text)
The problem is here: browser.find_element(By.NAME, 'query').send_keys('73533' + Keys.RETURN)
The correct version would be:
search = browser.find_element(By.NAME, 'query')
search.send_keys('73533')
search.send_keys(Keys.RETURN)
Full working code:
I use the Chrome driver; you can change that portion in no time.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.comicshoplocator.com/StoreLocator')
driver.maximize_window()
time.sleep(2)

search = driver.find_element(By.NAME, 'query')
search.send_keys('73533')
search.send_keys(Keys.ENTER)

soup = BeautifulSoup(driver.page_source, 'lxml')
for tag in soup.find_all('div', class_="LocationName"):
    print(tag.text)
Output:
MARK DOWN COMICS
WWW.DCBSERVICE.COM
Well, actually, this can be done with requests; there's no need to use Selenium. You can send a POST request to:
https://www.comicshoplocator.com/StoreLocator
import re
import requests
from bs4 import BeautifulSoup
data = {
    "showAll": "false",
    "showCsls": "true",
    "query": "73533",
}

response = requests.post(
    "https://www.comicshoplocator.com/StoreLocator",
    data=data,
)

soup = BeautifulSoup(response.text, "html.parser")
string = soup.select_one("script:-soup-contains('address')").string
unformatted_data = re.search(r"\(({.*?})\)", string, re.DOTALL).group(1)

# remove all the whitespace
formatted_data = re.sub(r"\s+", "", unformatted_data)
print(formatted_data)
Prints:
{storeno:"8816",lat:"41.0671081542969",lng:"-85.1372680664063",name:"WWW.DCBSERVICE.COM",address:"6005ESHELBYDR",address2:"WWW.DCBSERVICE.COM",city:"MEMPHIS",state:"TN",zip:"38141",phone:"",hasProfile:"True",storeLogo:'/Image/CslsLogo/'+"8816"}
This code worked for me:
listings = browser.find_elements(By.CLASS_NAME, 'CslsLocationItem')
for listing in listings:
    print(listing.find_element(By.CLASS_NAME, 'LocationName').get_attribute('innerText'))

AttributeError: 'NoneType' object has no attribute 'find' when web scraping with Python

I am working on an office project to collect data and check the active status of different websites, but whenever I try to get the data it sometimes shows None and sometimes raises this AttributeError. I followed the steps from YouTube videos but still get this error. Help, please.
Python code:
from bs4 import BeautifulSoup
import requests
html_text = requests.get(
    "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw").text
soup = BeautifulSoup(html_text, 'lxml')
status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)
The URL is dynamic, meaning the data is populated by JavaScript, so you need an automation tool such as Selenium.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

url = 'https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw'

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
driver.get(url)
time.sleep(10)  # give the JavaScript time to render the page

soup = BeautifulSoup(driver.page_source, 'lxml')
#driver.close()

status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)
Output:
Active
You have the most common problem: modern pages use JavaScript to add elements, but requests/BeautifulSoup can't run JavaScript.
So soup.find('div', ...) returns None instead of the expected element, and later that causes the problem with None.find('p').
You can use Selenium to control a real web browser, which can run JavaScript.
from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
url = "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw"
#driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.get(url)
#status = driver.find_element(By.XPATH, '//div[@class="ValidatorInfo_statusBadge__PBIGr"]')
wait = WebDriverWait(driver, 10)
status = wait.until(EC.visibility_of_element_located((By.XPATH, '//div[@class="ValidatorInfo_statusBadge__PBIGr"]')))
print(status.text)
Eventually you should check whether the page offers an API for the data.
You can also use DevTools (Network tab) to check whether the JavaScript reads the data from some URL, and then try that URL with requests. It can be much faster than Selenium, but the server may detect the script/bot and block it.
JavaScript usually gets the data as JSON, so you may not need to scrape HTML with BeautifulSoup at all.
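A minimal sketch of that approach, assuming you have found such an endpoint in the Network tab (the URL below is only a placeholder, not a real mintscan API path):

import requests

# Placeholder endpoint: replace it with the request URL you actually see in DevTools.
API_URL = "https://example.com/api/validator-status"

response = requests.get(API_URL, headers={"User-Agent": "Mozilla/5.0"})
response.raise_for_status()

data = response.json()   # already structured, no HTML parsing needed
print(data)              # inspect the keys, then pick the fields you want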

Python BeautifulSoup doesn't find hidden elements, problems with Selenium as an alternative

Hey, I asked a similar question before, and from what I've learned BeautifulSoup doesn't find what I'm searching for because the element is hidden.
For context: I see fancyCompLabel when I hover the cursor over Rathberger and examine the HTML code.
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
cclass=soup.find("div",class_="fancyCompLabel")
print(cclass)
It seems like Selenium could be a fix, but I have real problems implementing it.
Here's what I tried:
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
# Set some Selenium Options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome('chromedriver',options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
cclass=soup.find_all("div",class_="fancyCompLabel")
print(cclass)
1. There are two "Accept cookies" modals; one of them is inside an iframe. You need to accept both and switch into the iframe for the second one.
2. You need to wait for your elements to become present in the DOM after you accept the cookies.
I've managed to accomplish this with Selenium.
Solution
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Set some Selenium Options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Webdriver (the chromedriver path is specific to my machine)
wd = webdriver.Chrome(service=Service('/snap/bin/chromium.chromedriver'), options=options)

# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'

# Load URL
wd.get(url)

wait = WebDriverWait(wd, 15)

# Accept the first cookie banner, switch into the search iframe, then accept the second banner there
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
# wd.switch_to.default_content()  # no need to switch back, the results live inside this iframe

# Wait for the results to be present, then read them
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
results = wd.find_elements(By.CSS_SELECTOR, ".fancyCompLabel")
for p in results:
    print(p.text)
Result:
Rathberger GmbH
Deschberger Metall- und Blechbearbeit...
Anant GmbH
Gröbmer GmbH
Berma Plaatwerk BV
Punzonado y Láser METALKOR S.L.
Blankart AG - Werkzeugbau + Fertigung
Goodwill Precision Machinery (German ...
Bechtold GmbH
PMT GmbH

Scraping: cannot extract content from webpage

I am trying to scrape the news content from the following page, but with no success.
https://www.business-humanrights.org/en/latest-news/?&search=nike
I have tried with BeautifulSoup:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.business-humanrights.org/en/latest-news/?&search=nike")
soup = BeautifulSoup(r.content, 'lxml')
soup
but the content that I am looking for, the bits of news tagged as div class='card__content', does not appear in the soup output.
I also checked, but I could not find any frames to switch to.
Finally, I tried PhantomJS with the following code, but with no success:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = "https://www.business-humanrights.org/en/latest-news/?&search=nike"
driver = webdriver.PhantomJS(executable_path= '~\Chromedriver\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get(url)
time.sleep(7)
page = driver.page_source
driver.quit()
soup = BeautifulSoup(page, 'html.parser')
container = soup.find_all('div', attrs={'class': 'card__content'})
print(container)
I am running out of options, can anyone help?
Use the API:
import requests
r = requests.get("https://www.business-humanrights.org/en/api/internal/explore/?format=json&search=nike")
print(r.json())
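The JSON structure is not documented here, so a hedged follow-up is to inspect the shape of the response first and only then drill into specific fields (any key names you rely on after that are assumptions until you have looked at the output):

import requests

r = requests.get("https://www.business-humanrights.org/en/api/internal/explore/?format=json&search=nike")
payload = r.json()

# Look at the top-level shape before assuming anything about the field names.
if isinstance(payload, dict):
    print(list(payload.keys()))
elif isinstance(payload, list):
    print(len(payload), "items")
    if payload:
        print(payload[0])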
I don't understand why you're facing this. I tried the same as above, but not with requests and bs4; I used requests_html. XPath expressions can be used directly with this library, without any other libraries.
import requests_html

session = requests_html.HTMLSession()
URL = 'https://www.business-humanrights.org/en/latest-news/?&search=nike'
res = session.get(URL)

divs_with_required_class = res.html.xpath(r'//div[@class="card__content"]')
for item in divs_with_required_class:
    print(f'Div {divs_with_required_class.index(item) + 1}:\n', item.text, end='\n\n')
driver.page_source returns the initial HTML document content no matter how long you wait (time.sleep(7) has no effect).
Try the below instead:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver.get(url)
cards = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='card__content' and normalize-space(.)]")))
texts = [card.text for card in cards]
print(texts)
driver.quit()

How to scrape and extract links down to n levels, scrape the data again, and map it to the output in Python?

I am learning web crawling and scraping in Python. I want to scrape data from a site that contains links, and inside those links there are more links, so I want to scrape data down to a predefined level n.
This is my basic code:
import requests
from selenium import webdriver
from requests_ntlm import HttpNtlmAuth
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from webdrivermanager import GeckoDriverManager
import pickle
from selenium.webdriver.common.keys import Keys
from urllib.parse import urljoin
from seleniumrequests import Chrome
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
web_url = 'https://spaceflightnow.com/'
driver.get("https://spaceflightnow.com/")
time.sleep(5)
soup = BeautifulSoup(driver.page_source,"lxml")
#section = soup.section
links=[]
for url in soup.find_all('a', href=True):
    links.append(urljoin(web_url, url.get('href')))
    #print(urljoin(web_url, url.get('href')))
links = list(filter(lambda x: x != web_url,links))
print(links)
This prints multiple links from the first page. Now I want to follow all the links at each subsequent level and scrape them again, collecting more links inside. The same links may show up again internally, for example from the news feed. So what I want to know is: what should my approach be? I understand that I need a tree, but I cannot figure out exactly how to build it.
Say I create a list inside a list, but how do I do that dynamically down to level n? And how do I map it to the data saved in a file? Can anyone help me with this, maybe with a sample solution?
Thank you :)
I made an example which works without recursion; I would say it is similar to the Breadth-First Search algorithm.
It keeps the URLs in a list [(url, level), ...] to control the level, and in a set() to filter out already-visited pages. It also filters out links to external pages.
Tested with Firefox.
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# ---
def get_links(driver, url):
    driver.get(url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "lxml")
    links = []
    for new_url in soup.find_all('a', href=True):
        new_url = new_url.get('href')
        new_url = urljoin(url, new_url)
        links.append(new_url)
    return links
# ---
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
#driver = webdriver.Firefox()
# ---
domain = 'https://spaceflightnow.com/' # to filter external links
start_url = 'https://spaceflightnow.com/'
max_level = 2
links_visited = set([start_url]) # to test visited links
links_with_levels = [(start_url, 0)] # to control levels
# ---
for link, level in links_with_levels:
    if level >= max_level:
        print('skip:', level, link)
        continue
    print('visit:', level, link)
    links = get_links(driver, link)
    print('found:', len(links))
    links = list(set(links) - links_visited)
    print('after filtering:', len(links))
    level += 1
    for new_link in links:
        if new_link.startswith(domain):  # filter external links
            links_visited.add(new_link)
            links_with_levels.append((new_link, level))
# ---
# finally, print all collected links with their levels
for link, level in links_with_levels:
    print('skip:', level, link)
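To cover the "map it to a file" part of the question, a minimal follow-up sketch (appended to the script above) writes the collected (level, url) pairs to a CSV file; pairing each URL with scraped page data would just mean adding more columns here:

import csv

with open("crawl_result.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["level", "url"])
    for link, level in links_with_levels:
        writer.writerow([level, link])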
