Web crawler does not print - Python

I'm working on my first web crawler and I cannot figure out how to get it to print results. There is no error, but nothing displays.
from bs4 import BeautifulSoup
import urllib3
def extract_links():
    http = urllib3.PoolManager()
    r = http.request('GET', 'http://www.drankbank.com/happy-hour-chicago.html')
    soup = BeautifulSoup(r, 'html.parser')
    print(soup)

extract_links()
Thank you!

You are not accessing the data returned by the request.
soup = BeautifulSoup(r, 'html.parser')
should be:
soup = BeautifulSoup(r.data, 'html.parser')
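For reference, here is a minimal corrected version of the whole function (same URL as in the question; with urllib3, the response body lives in r.data):
import urllib3
from bs4 import BeautifulSoup

def extract_links():
    http = urllib3.PoolManager()
    r = http.request('GET', 'http://www.drankbank.com/happy-hour-chicago.html')
    # r is an HTTPResponse object; r.data holds the raw bytes of the body
    soup = BeautifulSoup(r.data, 'html.parser')
    print(soup)

extract_links()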

Related

Scraping data from site

I've tried to scrape some data from a site using BeautifulSoup. I scraped some of the data successfully, but with others (phone, website) I get errors.
This is the link to the site I'm trying to scrape: https://yellowpages.com.eg/en/search/spas/3231
from bs4 import BeautifulSoup
import requests

url = 'https://yellowpages.com.eg/en/search/spas/3231'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
info = soup.find_all('div', class_='col-xs-12 padding_0')
for item in info:
    phone = item.find('span', class_='phone-spans')
    print(phone)
Every time I run this code, the result is None.
Not sure where that code comes from, but I couldn't see anything similar in the page's HTML; however, this code works:
from bs4 import BeautifulSoup
import requests

url = 'https://yellowpages.com.eg/en/search/spas/3231'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
for item in soup.find_all('div', class_='searchResultsDiv'):
    name = item.find('a', class_='companyName').text.strip()
    phone = item.find('a', class_='search-call-mob')['href']
    print(name, phone)
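If you want just the number rather than the whole href value, you can strip the scheme. A small sketch, assuming the href is a tel: URI (which is what the link class suggests, but I haven't verified every entry):
from bs4 import BeautifulSoup
import requests

url = 'https://yellowpages.com.eg/en/search/spas/3231'
soup = BeautifulSoup(requests.get(url).content, 'lxml')

for item in soup.find_all('div', class_='searchResultsDiv'):
    phone_href = item.find('a', class_='search-call-mob')['href']
    phone = phone_href.replace('tel:', '')  # assumes an href like 'tel:+20...'
    print(phone)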

Web scraping YouTube pages

I'm trying to scrape a YouTube channel name via a link, but I get this error:
title = response.find_all('div', class_= "style-scope ytd-channel-name")
AttributeError: 'Response' object has no attribute 'find_all'
Link to site: https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg
Code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg'
response = requests.get(url)
title = response.find_all('div', class_="style-scope ytd-channel-name")
soup = BeautifulSoup(title.text, 'lxml')
print(soup)
Thank you!
We can use requests-html for this:
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs

video_url = "https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg"

# init an HTML session
session = HTMLSession()

# get the HTML content
response = session.get(video_url)

# execute the page's JavaScript
response.html.render(sleep=1)

# create a BeautifulSoup object to parse the rendered HTML
soup = bs(response.html.html, "html.parser")

name = soup.find('yt-formatted-string', class_='style-scope ytd-channel-name')
print(name.text)
Output:
TheTekkitRealm
The following code returns the divs:
import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
print(soup.div)
The value being returned can be changed through the attribute you access on soup (e.g. soup.title returns the first <title> tag). I'm linking the documentation because I think it would be useful for you to look at too:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#
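For example, a quick sketch of that attribute-style access (same URL as in the question):
import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/channel/UCHOgE8XeaCjlgvH0t01fVZg"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

print(soup.title)       # the document's first <title> tag
print(soup.title.text)  # just the text inside it
print(soup.a)           # the first <a> tag, if any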

'NoneType' object is not callable in Beautiful Soup 4

I'm new-ish to Python and started experimenting with Beautiful Soup 4. I tried writing code that would get all the links on one page and then, with those links, repeat the process until I have an entire website parsed.
import bs4 as bs
import urllib.request as url

links_unclean = []
links_clean = []

soup = bs.BeautifulSoup(url.urlopen('https://pythonprogramming.net/parsememcparseface/').read(), 'html.parser')

for url in soup.find_all('a'):
    print(url.get('href'))
    links_unclean.append(url.get('href'))

for link in links_unclean:
    if (link[:8] == 'https://'):
        links_clean.append(link)

print(links_clean)

while True:
    for link in links_clean:
        soup = bs.BeautifulSoup(url.urlopen(link).read(), 'html.parser')
        for url in soup.find_all('a'):
            print(url.get('href'))
            links_unclean.append(url.get('href'))
        for link in links_unclean:
            if (link[:8] == 'https://'):
                links_clean.append(link)
        links_clean = list(dict.fromkeys(links_clean))
    input()
But I'm now getting this error:
'NoneType' object is not callable
line 20, in <module>
soup = bs.BeautifulSoup(url.urlopen(link).read(), 'html.parser')
Can you please help?
Be careful when importing modules under an alias. In this case, the url alias from line 2 gets shadowed by the loop variable when you iterate in your for loop.
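A minimal way to avoid the clash is to import urlopen directly and give the loop variable a name that is never reused, e.g.:
import bs4 as bs
from urllib.request import urlopen  # imported directly, so no alias to shadow

soup = bs.BeautifulSoup(urlopen('https://pythonprogramming.net/parsememcparseface/').read(), 'html.parser')
for tag in soup.find_all('a'):  # 'tag' cannot collide with a module name
    print(tag.get('href'))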
Here is a shorter solution that will also give back only URLs containing https as part of the href attribute:
from bs4 import BeautifulSoup
from urllib.request import urlopen

content = urlopen('https://pythonprogramming.net/parsememcparseface/')
soup = BeautifulSoup(content, "html.parser")
base = soup.find('body')

for link in BeautifulSoup(str(base), "html.parser").findAll("a"):
    if 'href' in link.attrs:
        if 'https' in link['href']:
            print(link['href'])
However, this paints an incomplete picture: not all links are captured, because of errors in the page's HTML tags. May I also recommend the following alternative, which is very simple and works flawlessly in your scenario (note: you will need the Requests-HTML package):
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://pythonprogramming.net/parsememcparseface/')

for link in r.html.absolute_links:
    print(link)
This will output all URLs, including both those that reference other URLs on the same domain and those that are external websites.
I would consider using an attribute=value CSS selector with the ^ operator to specify that the href attributes begin with https; you will then only have valid protocols. Also, use set comprehensions to ensure no duplicates, and a Session to re-use the connection.
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

final = []

with requests.Session() as s:
    r = s.get('https://pythonprogramming.net/parsememcparseface/')
    soup = bs(r.content, 'lxml')
    httpsLinks = {item['href'] for item in soup.select('[href^=https]')}
    for link in httpsLinks:
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        newHttpsLinks = [item['href'] for item in soup.select('[href^=https]')]
        final.append(newHttpsLinks)

tidyList = list({item for sublist in final for item in sublist})
df = pd.DataFrame(tidyList)
print(df)

Python3 : BeautifulSoup4 not returning expected value

I'm currently trying to scrape some data from a website using BS4 under Python 3.6.4, but the value returned is not what I'm expecting:
import requests
from bs4 import BeautifulSoup
link = "https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc"
request = requests.get(link)
page = request.content
soup = BeautifulSoup(page, "html5lib")
price = soup.find("div", {"class": "fieldPrice sizeC"}).text
print(price)
I should get "39 900 €" but the code returns "47 880 â¬".
NB: Even without JS, the data should be "39 900 €".
Thanks for your help!
The encoding declaration is wrong on this page so BeautifulSoup gets told to use the wrong encoding. You can force it to use the correct encoding like this:
import requests
from bs4 import BeautifulSoup
link = "https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc"
request = requests.get(link)
page = request.content
soup = BeautifulSoup(page.decode('utf-8','ignore'), "html5lib")
price = soup.find("div", {"class": "fieldPrice sizeC"}).text
print(price)
Outputs:
49 070 €
Instead of page.content, use page.text. Ex:
import requests
from bs4 import BeautifulSoup
link = "https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc"
request = requests.get(link)
page = request.text
soup = BeautifulSoup(page, "html.parser")
price = soup.find("div", {"class": "fieldPrice sizeC"}).text
print(price)
.text automatically decodes the content from the server.
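A quick way to see the difference between the two attributes (all of these are part of the requests API):
import requests

r = requests.get("https://www.lacentrale.fr/listing?makesModelsCommercialNames=FERRARI&sortBy=priceAsc")
print(type(r.content))      # <class 'bytes'>: the raw, undecoded body
print(type(r.text))         # <class 'str'>: decoded using r.encoding
print(r.encoding)           # the encoding requests took from the response headers
print(r.apparent_encoding)  # the encoding guessed from the body itself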

BeautifulSoup: fetched all the links on a webpage, how to navigate through them without Selenium?

So I'm trying to write a mediocre script to download subtitles from one particular website, as y'all can see. I'm a newbie to BeautifulSoup; so far I have a list of all the "href" values after a search query (GET). So how do I navigate further after getting all the links?
Here's the code:
import requests
from bs4 import BeautifulSoup

usearch = input("Movie Name? : ")
url = "https://www.yifysubtitles.com/search?q=" + usearch
print(url)
resp = requests.get(url)
soup = BeautifulSoup(resp.content, 'lxml')

for link in soup.find_all('a'):
    dictn = link.get('href')
    print(dictn)
You need to use resp.text instead of resp.content.
Try this to get the search results.
import requests
from bs4 import BeautifulSoup

base_url_f = "https://www.yifysubtitles.com"
search_url = base_url_f + "/search?q=last+jedi"
resp = requests.get(search_url)
soup = BeautifulSoup(resp.text, 'lxml')

for media in soup.find_all("div", {"class": "media-body"}):
    print(base_url_f + media.find('a')['href'])
out: https://www.yifysubtitles.com/movie-imdb/tt2527336
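From there you can request each result URL in turn and parse it the same way. A small sketch that just prints each movie page's <title> (I haven't inspected the structure of the individual pages, so anything more specific would be guesswork):
import requests
from bs4 import BeautifulSoup

base_url_f = "https://www.yifysubtitles.com"
search_url = base_url_f + "/search?q=last+jedi"
soup = BeautifulSoup(requests.get(search_url).text, 'lxml')

# collect the per-movie URLs from the search results
movie_urls = [base_url_f + media.find('a')['href']
              for media in soup.find_all("div", {"class": "media-body"})]

# then navigate: fetch and parse each movie page in turn
for movie_url in movie_urls:
    movie_soup = BeautifulSoup(requests.get(movie_url).text, 'lxml')
    print(movie_soup.title.text)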
