Beautiful Soup IMDB - python

I am trying to get the names of the IMDB top movies, but I don't know how to get specifically the movie names.
Here is the source code:
The Shawshank Redemption
and here is my code:
import requests
from bs4 import BeautifulSoup as bs
file = open("text-txt-file.txt", "w")
imdburl1 = "https://www.imdb.com/chart/top"
r = requests.get(imdburl1)
soup = bs(r.content, "lxml")
data = soup.find_all("table", {"class":"chart full-width"})
movietable = (data[0].contents)[len(data[0].contents) - 2]
movietable = movietable.find_all("tr")
for i in movietable:
    filmtitles = i.find_all("td", {"class":"titleColumn"})
    for j in filmtitles:
        moviename = j.find_all("a")
        print()  # what to do ????
input()

Run a loop on moviename, then get the title:
for title in moviename:
    print(title.get('title'))
Full code:
import requests
from bs4 import BeautifulSoup as bs
file = open("text-txt-file.txt", "w")
imdburl1 = "https://www.imdb.com/chart/top"
r = requests.get(imdburl1)
soup = bs(r.content, "lxml")
data = soup.find_all("table", {"class": "chart full-width"})
movietable = (data[0].contents)[len(data[0].contents) - 2]
movietable = movietable.find_all("tr")
for i in movietable:
    filmtitles = i.find_all("td", {"class": "titleColumn"})
    for j in filmtitles:
        moviename = j.find_all("a")
        for title in moviename:
            print(title.get('title'))
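As a side note, a more compact route is to select the title-column links directly with a CSS selector. This is only a sketch, assuming the chart page still uses the td.titleColumn markup your code already targets:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get("https://www.imdb.com/chart/top")
soup = bs(r.content, "lxml")
# every <a> inside a td.titleColumn holds one movie name
for a in soup.select("td.titleColumn a"):
    print(a.text)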

Related

Web scraping doesn't iterate over entire webpage

I'm trying to scrape the names and ratings of all the players from this website:
https://www.fifaindex.com/players/?gender=0&league=1&order=desc
But I only get the information for the first player on the page.
The code I'm using:
from bs4 import BeautifulSoup
import requests
url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all('div', class_="responsive-table table-rounded")
for result in results:
    rating = result.find("span", class_="badge badge-dark rating r3").text
    name = result.find("a", class_="link-player")
    info = [rating, name]
    print(info)
The parsed HTML is attached in the picture.
I tinkered around a little bit and I think I got a version that does what you want:
from bs4 import BeautifulSoup
import requests
page = requests.get("https://www.fifaindex.com/players/?gender=0&league=1&order=desc")
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find_all("tr")
for result in results:
    try:
        result["data-playerid"]
    except KeyError:
        continue
    rating = result.find("span", class_="badge badge-dark rating r3").text
    name = result.find("a", class_="link-player")
    info = [rating, name]
    print(info)
Getting all table rows with a data-playerid attribute will fix it:
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
results = soup.find_all('tr', {'data-playerid': True})
for res in results:
    rating = res.find("span", class_="badge badge-dark rating r3").text
    name = res.find("a", class_="link-player")
    info = [rating, name]
    print(info)
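If you want plain strings in info rather than Tag objects, call .text on the name tag as well. Here is a sketch under the assumption that every player row has a span whose class list includes rating (the trailing r3 part might differ between players, so matching the shared class is safer):
for res in soup.find_all('tr', {'data-playerid': True}):
    rating_tag = res.find("span", class_="rating")  # matches any span with 'rating' among its classes
    name_tag = res.find("a", class_="link-player")
    if rating_tag and name_tag:
        print([rating_tag.text, name_tag.text])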

Scraping a range of news articles with Beautiful Soup - issue with showing all articles. Full code below

# Importing libraries:
import html5lib
import nltk
from newspaper import Article
import requests
from bs4 import BeautifulSoup
import numpy as np
# select a url of any news article
url = 'https://www.standaard.be/nieuws/meest-recent'
article = Article(url)
# setting up beautiful soup requirements:
article.download()
article.parse()
nltk.download('punkt')
article.nlp()
r1 = requests.get(url)
coverpage = r1.content
soup = BeautifulSoup(coverpage, 'html5lib')
coverpage_news = soup.find_all('a', class_='link-live')
n_articles = 150
# testing for loop. Everything works fine up until here.
for i in np.arange(0, n_articles):
    link = coverpage_news[i]['href']
    print(link)
nr_articles = len(link)+1
nr_articles
Each of the sections does what it's supposed to do, but somehow the for-loop doesn't seem to extract the article titles one by one. Should I nest another loop in the existing for-loop? Any help would be highly appreciated!
# Should be all coming together:
# Scraping all articles
number_of_articles = nr_articles
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []
for n in np.arange(0, number_of_articles):
    # Getting the link of the article
    link = coverpage_news[n]['href']
    list_links.append(link)
    # Getting the title
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    title = soup_article.find_all('header', class_='article__header')
    y = title[0].find_all('h1')
    list_titles.append(y)
    # Reading the content (it is divided in paragraphs)
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    body = soup_article.find_all('div', class_='article__body')
    x = body[0].find_all('p')
    # Unifying the paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
    final_article = " ".join(list_paragraphs)
    news_contents.append(final_article)
What happens?
You're grabbing all the links and defining n_articles = 150.
Then you try to assign number_of_articles = nr_articles, but nr_articles is not defined.
Solution
Change n_articles = 150 to nr_articles = 150
Example
import requests
from bs4 import BeautifulSoup
import numpy as np
# select a url of any news article
url = 'https://www.standaard.be/nieuws/meest-recent'
soup = BeautifulSoup(requests.get(url).content, 'lxml')
coverpage_news = soup.find_all('a', class_='link-live')
nr_articles = 3
number_of_articles = nr_articles
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []
for n in np.arange(0, number_of_articles):
link = coverpage_news[n]['href']
list_links.append(link)
# Getting the contents
article = requests.get(link)
article_content = article.content
soup_article = BeautifulSoup(article_content, 'html5lib')
list_titles.append(soup_article.select_one('header.article__header > h1').get_text())
news_contents.append(''.join([p.get_text() for p in soup_article.select('div.article__body p')]))
list_titles
Output
['Van Gucht tempert paniek over Britse variant: ‘Virus in geheel blijven bestrijden’',
'Advies experten: ‘Laat kinderen vanaf tien jaar mondmasker dragen op publieke plaatsen’',
'Parket opent onderzoek naar perslekken in de zaak-Kucam']
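If you would rather scrape every article currently listed on the overview page instead of a fixed number, you could derive the count from the link list itself; a small tweak, assuming coverpage_news holds one <a class="link-live"> per article:
nr_articles = len(coverpage_news)
number_of_articles = nr_articles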

How can I scrape song titles from this request that I have collected using Python

import requests
from bs4 import BeautifulSoup
r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")
result = soup.find("div", {"class": "s_c"})
print(result)
From the above code, I am able to scrape this data:
https://www.pastiebin.com/5f08080b8db82
Now I would like to scrape only the titles of the songs and then make a list out of them, like below:
Meri Aashiqui
Genda Phool
Any suggestions are much appreciated!
Try this:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")
result = soup.find("div", {"class": "s_c"})
#print(result)
div = result.find_all('div', class_='track_npqitemdetail')
name_list = []
for x in div:
    span = x.find('span').text
    name_list.append(span)
print(name_list)
This code will return all the song names in the name_list list.
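The loop can also be written as a list comprehension, under the same assumptions about the class names as above:
name_list = [x.find('span').text for x in result.find_all('div', class_='track_npqitemdetail')]
print(name_list)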

Scrape websites with a unique URL (Python)

I am currently working on a web scraping project, but I am having difficulty with the URL of the website, because it doesn't change when I go through the pages.
The website: https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail
My goal is to scrape all the buildings on the two pages.
The only way I can scrape the data is by using the inspect tool and copying the wrapper around all the ads.
This is my code:
from bs4 import BeautifulSoup
import requests
import csv
import string
import glob
# Grab the soup (content)
source = requests.get("https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
soup = BeautifulSoup(source.content, 'html.parser')
# Loop through all the ads on the page
for ad in soup.find_all('div', {"data-id":"templateThumbnailItem"}):
    if (soup.find('div', {"class":"price"})):
        # Get the address
        address = ad.find('span', {"class":"address"})
        address = address.findChild().text
        address = address.strip()
        # Get the district
        district = ad.find('span', {"class":"address"})
        district = district.findChildren()[1].text
        district = district.strip()
        # Get the type
        typeBuilding = ad.find('span', {"class":"category"}).text
        typeBuilding = typeBuilding.strip()
        typeBuilding = typeBuilding[0:7].strip()
        # Get the Price
        price = ad.find('span', {"itemprop":"price"}).text
        price = price.replace('$','')
        price = price.replace(u'\xa0','')
        price = int(str(price))
        cnt = cnt + 1
        print(f'Adresse: {address}, Quartier: {district}, Type: {typeBuilding}, Prix: {price}$')
Thank you for helping!
import requests
from bs4 import BeautifulSoup
import csv
def main(url):
    with requests.Session() as req:
        r = req.get(
            "https://www.centris.ca/fr/triplex~a-vendre~montreal-mercier-hochelaga-maisonneuve?uc=1&view=Thumbnail")
        with open("data.csv", 'w', newline="", encoding="UTF-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Address", "Quartier", "Type", "Price"])
            for num in range(0, 40, 20):
                data = {'startPosition': num}
                r = req.post(url, json=data).json()
                html = r["d"]["Result"]["html"]
                soup = BeautifulSoup(html, 'html.parser')
                prices = [format(int(price.get("content")), ',d') for price in soup.findAll(
                    "span", itemprop="price")]
                block = soup.findAll("div", class_="location-container")
                ty = [ty.div.get_text(strip=True) for ty in block]
                add = [add.select_one(
                    "span.address div").text for add in block]
                quartier = [quar.select_one(
                    "span.address div:nth-child(2)").text for quar in block]
                final = zip(add, quartier, ty, prices)
                writer.writerows(final)
main("https://www.centris.ca/Mvc/Property/GetInscriptions")
Output: View Online
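This works even though the page URL never changes: the listing thumbnails are loaded by the site through a POST request to /Mvc/Property/GetInscriptions with a startPosition offset in the JSON body (0 and 20 cover the two pages here), and the JSON response carries the rendered HTML fragment under r["d"]["Result"]["html"], which is then parsed with BeautifulSoup as usual.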

Access Hidden Data on a page

I need to access the following website: http://mothoq.com/store/22, scroll down till I see the phone icon, click on it, and scrape the phone number.
I have successfully connected to the website and am able to scrape all the data needed, except for the phone number.
I have tried to use:
soup.find_all('p',attrs={"align":"center"})
My code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")
results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
    storeName = soup.find('h1')
    url = soup.find('font').text
    contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
    for storeContact in contacts:
        storePhone = soup.find_all('p', attrs={"align":"center"})
        storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"})['href']
        storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"})['href']
        storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"})['href']
        print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second <p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
try:
    storePhone = soup.find('div', attrs={"id":"store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"}).get('href')
except:
    pass
print(storePhone)
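One usage note: if the phone div is missing, the bare except swallows the error but the final print then fails with a NameError because storePhone was never assigned. A slightly more defensive variant (just a sketch, same ids assumed as above) would be:
phone_div = soup.find('div', attrs={"id": "store-telephone-form"})
phone_paragraphs = phone_div.find_all('p') if phone_div else []
storePhone = phone_paragraphs[1].text if len(phone_paragraphs) > 1 else None
print(storePhone)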
