Web scraping with BS4 - python

I have a problem with scraping some basic info about movies from imdb.com. I want my program to get title and description of a movie from a given URL. The title part is doing its job, however I can't figure out how to get the description. Here's my code:
import requests
from bs4 import BeautifulSoup as bs
def get_data(url):
r = requests.get(url, headers={'Accept-Language': 'en-US,en;q=0.5'})
if not r or 'https://www.imdb.com/title' not in url:
return print('Invalid movie page!')
return r.content
if __name__ == '__main__':
# print('Input the URL:')
# link = input()
link = 'https://www.imdb.com/title/tt0111161'
data = get_data(link)
soup = bs(data, 'html.parser')
title = ' '.join(soup.find('h1').text.split()[:-1])
desc = soup.find('p', {'data-testid':"plot", 'class':"GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
movie_info = {'title': title, 'description': desc}
print(movie_info)
When I run it I get an error:
Exception has occurred: AttributeError
'NoneType' object has no attribute 'text'
File "movie-scraper.py", line 18, in <module>
desc = soup.find('p', {'data-testid':"plot", 'class':"GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
How do I access the description properly?

To get plot summary, change the selector to find class="plot_summary":
import requests
from bs4 import BeautifulSoup as bs
def get_data(url):
r = requests.get(url, headers={"Accept-Language": "en-US,en;q=0.5"})
if not r or "https://www.imdb.com/title" not in url:
return print("Invalid movie page!")
return r.content
if __name__ == "__main__":
link = "https://www.imdb.com/title/tt0111161"
data = get_data(link)
soup = bs(data, "html.parser")
title = " ".join(soup.find("h1").text.split()[:-1])
desc = soup.find("div", class_="plot_summary").get_text(strip=True) # <-- change this to find class="plot_summary"
movie_info = {"title": title, "description": desc}
print(movie_info)
Prints:
{'title': 'The Shawshank Redemption', 'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.Director:Frank DarabontWriters:Stephen King(short story "Rita Hayworth and Shawshank Redemption"),Frank Darabont(screenplay)Stars:Tim Robbins,Morgan Freeman,Bob Gunton|See full cast & crew»'}

Related

Trying to scrape other category with beautifulsoup

this is the website i am trying to scrape:
[https://www.jurongpoint.com.sg/store-directory/]
This is my code,as u can see i don't know how to fill both of the {} for the url variable as the 4 category that i want to scrape especially url for service is very different. The comment above url variable shows the link of the 4 category when clicked in. Appreciate any help,thank you!
from bs4 import BeautifulSoup
import requests
def parse():
cate=["Service","Food & Beverage","Fashion & Accessories","Electronics & Technology"]
#cate=Food+%26+Beverage
#cate=Electronics+%26+Technology
#cate=Fashion+%26+Accessories
#cate=Services
url="https://www.jurongpoint.com.sg/store-directory/?level=&cate={}+%26+{}"
for cat in cate:
for page in range(1,14):
print(page)
soup = BeautifulSoup(requests.get(url).text ,"html.parser")
for link in soup.find_all('div',class_='entry-content'):
try:
shops=soup.find_all('div',class_="col-9")
names=soup.find_all('tr',class_="clickable")
for n, k in zip(names, shops):
name = n.find_all('td')[1].text.replace(' ','')
desc = k.text.replace(' ','')
print(name + "\n")
print(desc)
except AttributeError as e:
print(e)
next_button = soup.select_one('.PagedList-skipToNext a')
if next_button:
url = next_button.get('href')
else:
break
parse()
Use parameters of your request and avoid to manage escape characters (like %26)
url = "https://www.jurongpoint.com.sg/store-directory"
for cat in cate:
for page in range(1, 14):
print(f'Scraping category {cat} page {page}')
payload = {
'level': '',
'cate': cat,
'page': page
}
resp = requests.get(url, params=payload)
soup = BeautifulSoup(resp.text, 'html.parser')
# your code here
>>> resp.url
'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Electronics+%26+Technology&page=8'

How to extract all data having same class using python

i need to find all title data and stock number. I wrote a code and its works well with single item when I use find_all method it shows en error please have a ook at my code and guide me how can I handle this. thank you!
here is my code:
import requests
from bs4 import BeautifulSoup
#import pandas as pd
#import numpy as
import csv
def get_page(url):
response = requests.get(url)
if not response.ok:
print('server responded:', response.status_code)
else:
soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
return soup
def get_detail_page(soup):
title = soup.find_all('div',class_="vehicle-location-name mts bold",id=False).text
print(title)
stock = soup.find_all('div',class_="text-lightgray",id=False).find('span').text
print(stock)
def main():
url = "https://www.superbrightleds.com/vehicle/2002-acura-cl-vehicle-led-lights?make=1&model=554&year=2002"
get_detail_page(get_page(url))
if __name__ == '__main__':
main()
Try:
def get_detail_page(soup):
titles = soup.findAll('div', attrs={"class": "vehicle-location-name mts bold"})
stocks = soup.findAll('div', attrs={"class": "text-lightgray"})
title = [title.get_text() for title in titles if title]
stock = [stock.get_text() for stock in stocks if stock and 'Stock #' in str(stock)]
for idx in range(len(stock)):
print(f'{title[idx]}\n\t{stock[idx]}')

how to get products names from amazon

I saw all the relevant previous topics about that manner and i've learned a lot (especially about the difference between lxml and html.parser)
anyway after i changed my BeautifulSoup to parse the page as lxml I still can't get all the time the same result .
most of the time i get : " name = soup.find('span', id="productTitle").text
AttributeError: 'NoneType' object has no attribute 'text' "
but in a few times i get the real name of the product.
what am I missing?
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
client = requests.get(url, headers={"User-Agent": "Defined"})
try:
client.raise_for_status()
except requests.exceptions.HTTPError as e:
print("Error!!!!" + str(e))
soup = BeautifulSoup(client.content, 'lxml')
name = soup.find('span', id="productTitle").text
title = name[161:len(name)-128]
print("the title is: ", title)
if __name__ == '__main__':
main()
Presumably you are getting different html back. Check the html itself. You can add in a test of whether variables are None before attempting to access .text
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/Homego-Bluetooth-Wrist-Smart-Handsfree/dp/B01DOULDN0/ref=sr_1_1?keywords=smart+watch&qid=1569450390&sr=8-1"
client = requests.get(url, headers={"User-Agent": "Defined"})
try:
client.raise_for_status()
except requests.exceptions.HTTPError as e:
print("Error!!!!" + str(e))
soup = BeautifulSoup(client.content, 'lxml')
name = soup.find('span', id="productTitle")
if name is None:
name = 'N/A'
title = 'n/a'
else:
name = name.text
title = name[161:len(name)-128]
print("the title is: ", title)
if __name__ == '__main__':
main()

TypeError: 'NoneType' object does not support item assignment in web application

I'm trying to develop an application which can download videos and sound tracks from Youtube, but sometimes it pops out the TypeError shown by the link after making a query like "harry potter" or "lord of the rings".
TypeError Message
And here is my python code:
from __future__ import unicode_literals
import requests
from bs4 import BeautifulSoup
import youtube_dl
def find_search_content(search):
request = requests.get("https://www.youtube.com/results?search_query={}".format(search))
content = request.content
soup = BeautifulSoup(content, "html.parser")
return soup
def find_page_content(search):
request = requests.get("https://www.youtube.com/results?{}".format(search))
content = request.content
soup = BeautifulSoup(content, "html.parser")
return soup
def find_video(soup, all_item, i=1):
for element in soup.find_all('a', {"rel": "spf-prefetch"}):
video_title = element.get('title')
video_link = element.get('href')
img_value = element.get('href').split("=")[1]
img = "https://i.ytimg.com/vi/{}/hqdefault.jpg".format(img_value)
all_item['{}'.format(i)] = {"title": video_title, "link": "https://www.youtube.com{}".format(video_link),
'img': img}
i += 1
return all_item
def video_time(soup, all_item, i=1):
for time in soup.find_all('span', {"class": "video-time"}):
all_item.get('{}'.format(i))['time'] = time.text
i += 1
return all_item
def every_video(soup):
all_item = {}
find_video(soup, all_item, i=1)
video_time(soup, all_item, i=1)
return all_item
Thank you very much!

BeautifulSoup and Python scraping around a js tag, maybe?

Trying to get multiple headlines, links and dates. Only getting the first one. Not sure why BS4 won't fetch all the items...Is it a javascript problem?
from bs4 import BeautifulSoup
from urllib import urlopen
html = urlopen("http://www.fiercepharma.com/news")
soup = BeautifulSoup(html.read().decode('utf-8'),"lxml")
main_div = soup.select_one("div#content")
div_sub = main_div.select("div.region.region-content")
for d in div_sub:
date = d.time.get_text()
headline = d.h2.a.get_text()
url = d.a["href"]
print headline, url, date
What about using the following to capture all the articles that contain links, authors, posting dates on the main page. You could store this in a dictionary, or store it in a pandas dataframe for easy manipulation.
from bs4 import BeautifulSoup
import requests
baseurl = 'http://www.fiercepharma.com'
response = requests.get(baseurl)
soup = BeautifulSoup(response.content)
cdict = {}
for group in soup.find_all('div', {'class' : 'card horizontal views-row'}):
try:
title = group.find('h2', {'class' : 'field-content list-title'}).text
link = baseurl + group.find('h2', {'class' : 'field-content list-title'}).find('a', href=True)['href']
author = group.find('span', {'class' : 'field-content'}).find('a').text
time = group.find('span', {'class' : 'field-content'}).find('time').text
content = group.find('p', {'class' : 'field-content card-text'}).text
cdict[link] = {'title' : title, 'author' : author, 'time' : time, 'content' : content}
except AttributeError as e:
print('[-] Unable to parse {}'.format(e))
print(cdict)
#{'http://www.fiercepharma.com/manufacturing/lonza-bulks-up-5-5b-deal-for-capsugel': {'author': u'Eric Palmer',
# 'content': u'Swiss CDMO Lonza has pulled the trigger on a $5.5 billion deal to acquire the U.S.-based contract capsule and drug producer Capsugel to create another sizable\u2026',
# 'time': u'Dec 15, 2016 8:45am',
# 'title': u'Lonza bulks up with $5.5B deal for Capsugel'},
Both div.card.horizontal.views-row and .card.horizontal.views-row should work #citra_amarillo. I ran this and it work bothways
from bs4 import BeautifulSoup
from urllib import urlopen
html = urlopen("http://www.fiercepharma.com/news")
soup = BeautifulSoup(html.read().decode('utf-8'),"lxml")
main_div = soup.select_one("div#content")
div_sub = main_div.select(".card.horizontal.views-row")
#div_sub = main_div.select("div.card.horizontal.views-row")
for d in div_sub:
date = d.time.get_text()
headline = d.h2.a.get_text()
url = d.a["href"]
print headline, url, date

Categories

Resources