I've tried to build a web scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get output for some of the scraped pages and sometimes it doesn't work at all.
I am using the selenium and BeautifulSoup packages in a Jupyter Notebook. I iterate over the pages via the URL parameters &page={}&from={}. I tried By.XPATH before and simply clicked the next button at the end of each page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd

#3 ------------CNN SCRAPER
#3.1 ----------Define function
def CNN_Scraper(max_pages):
    base = "https://edition.cnn.com/"
    browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
    browser.implicitly_wait(30)  # implicit wait applies to every find call automatically
    base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'

    #-------------Define empty lists to be scraped
    CNN_title = []
    CNN_date = []
    CNN_article = []
    CNN_link = []
    article_count = 0

    #-------------iterate over pages and extract
    for page in range(1, max_pages + 1):
        print("Page %d" % page)
        url = base_url + "&page=%d&from=%d" % (page, article_count)
        browser.get(url)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        search_results = soup.find('div', {'class': 'cnn-search__results-list'})
        contents = search_results.find_all('div', {'class': 'cnn-search__result-contents'})
        for content in contents:
            try:
                title = content.find('h3').text
                print(title)
                link = content.find('a')
                link_url = link['href']
                date = content.find('div', {'class': 'cnn-search__result-publish-date'}).text.strip()
                article = content.find('div', {'class': 'cnn-search__result-body'}).text
            except:
                print("loser")
                continue
            CNN_title.append(title)
            CNN_date.append(date)
            CNN_article.append(article)
            CNN_link.append(link_url)
        article_count += 100
        print("-----")

    #-------------Save in DF
    df = pd.DataFrame()
    df['title'] = CNN_title
    df['date'] = CNN_date
    df['article'] = CNN_article
    df['link'] = CNN_link
    browser.quit()
    return df

#3.2 ----------Call Function - Scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details, check my previous answer:
import requests
import json

def main(url):
    with requests.Session() as req:
        for item in range(1, 1000, 100):
            r = req.get(url.format(item)).json()
            for a in r['result']:
                print("Headline: {}, Url: {}".format(
                    a['headline'], a['url']))

main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")
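If you want the results in a DataFrame and a pickle as in your original function, here is a minimal sketch built on the same request. Only the headline and url fields are shown above; inspect the JSON response yourself for any other fields (dates, body text) you need:
import requests
import pandas as pd

url = "https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}"

rows = []
with requests.Session() as req:
    for item in range(1, 1000, 100):
        r = req.get(url.format(item)).json()
        for a in r['result']:
            # 'headline' and 'url' are the fields used in the answer above;
            # add further keys after inspecting the JSON response
            rows.append({"headline": a['headline'], "url": a['url']})

CNN_data = pd.DataFrame(rows)
CNN_data.to_pickle("CNN_data")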
I'm doing a scraping exercise on a job-search webpage. I want to get the link, company name, job title, salary, location and posting date. I've run the same code multiple times; sometimes it gives the expected result in the salary column (the salary if the info is displayed, "N/A" otherwise), and sometimes it gives me something different: the salary if displayed, "N/A", plus some random character values in rows whose value should be "N/A". I have no problems with the other elements. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://ca.indeed.com/')

#Inputs a job title and location into the input boxes
input_box = driver.find_element(By.XPATH, '//*[@id="text-input-what"]')
input_box.send_keys('data analyst')
location = driver.find_element(By.XPATH, '//*[@id="text-input-where"]')
location.send_keys('toronto')

#Clicks on the search button
button = driver.find_element(By.XPATH, '//*[@id="jobsearch"]/button').click()

#Creates a dataframe
df = pd.DataFrame({'Link':[''], 'Job Title':[''], 'Company':[''], 'Location':[''], 'Salary':[''], 'Date':['']})

#This loop goes through every page and grabs all the details of each posting
#Loop will only end when there are no more pages to go through
while True:
    #Imports the HTML of the current page into python
    soup = BeautifulSoup(driver.page_source, 'lxml')
    #Grabs the HTML of each posting
    postings = soup.find_all('div', class_='slider_container css-g7s71f eu4oa1w0')
    len(postings)
    #grabs all the details for each posting and adds it as a row to the dataframe
    for post in postings:
        link = post.find('a').get('href')
        link_full = 'https://ca.indeed.com' + link
        name = post.find('h2', tabindex='-1').text.strip()
        company = post.find('span', class_='companyName').text.strip()
        try:
            location = post.find('div', class_='companyLocation').text.strip()
        except:
            location = 'N/A'
        try:
            salary = post.find('div', attrs={'class': 'heading6 tapItem-gutter metadataContainer noJEMChips salaryOnly'}).text.strip()
        except:
            salary = 'N/A'
        date = post.find('span', class_='date').text.strip()
        df = df.append({'Link':link_full, 'Job Title':name, 'Company':company, 'Location':location, 'Salary':salary, 'Date':date},
                       ignore_index=True)
    #checks if there is a button to go to the next page, and if not will stop the loop
    try:
        button = soup.find('a', attrs={'aria-label': 'Next'}).get('href')
        driver.get('https://ca.indeed.com' + button)
    except:
        break
Can I fix my code so that I get the expected results every time I run it? There is also an additional issue: I'm scraping around 60 pages, but the program usually stops somewhere between 20 and 30 pages before the last page. Is there a way to fix the code so that it scrapes through to the last page every time?
Here is a simplified example with the requests library:
import requests
from bs4 import BeautifulSoup

cookies = {}
headers = {}
params = {
    'q': 'data analyst',
    'l': 'toronto',
    'from': 'searchOnHP',
}
response = requests.get('https://ca.indeed.com/jobs', params=params, cookies=cookies, headers=headers)
soup = BeautifulSoup(response.text)
postings = soup.find_all('div', class_='slider_container css-g7s71f eu4oa1w0')
len(postings)
prints
15
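From there you can parse each posting just as in your Selenium version. Below is a sketch that reuses the selectors from your own code (Indeed changes its markup regularly, so these class names may need updating) and checks each lookup explicitly instead of relying on a bare except:
import pandas as pd

def safe_text(tag):
    # return stripped text if the tag was found, otherwise 'N/A'
    return tag.text.strip() if tag else 'N/A'

rows = []
for post in postings:  # `postings` is the list built in the snippet above
    link = post.find('a')
    rows.append({
        'Link': ('https://ca.indeed.com' + link.get('href')) if link else 'N/A',
        'Job Title': safe_text(post.find('h2', tabindex='-1')),
        'Company': safe_text(post.find('span', class_='companyName')),
        'Location': safe_text(post.find('div', class_='companyLocation')),
        'Salary': safe_text(post.find('div', attrs={'class': 'heading6 tapItem-gutter metadataContainer noJEMChips salaryOnly'})),
        'Date': safe_text(post.find('span', class_='date')),
    })

df = pd.DataFrame(rows)
For pagination, the same idea as in your Selenium loop applies: keep requesting the URL behind the link with aria-label 'Next' (using requests.get instead of driver.get) until that link is no longer present.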
So I have a list of urls (called "data") that contains urls like
https://www.amazon.com/Airpods-Fashion-Protective-Accessories-Silicone/product-reviews/B08YD8JLNQ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
and
https://www.amazon.com/Keychain-R-fun-Protective-Accessories-Visible-Sky/product-reviews/B082W7DL1R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
Some urls do not have the "Next Page" icon and some do. So far my code is something like this:
from bs4 import BeautifulSoup
import requests
import csv
import os
import pandas as pd
from selenium import webdriver
from selenium.common import exceptions
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

data = pd.read_csv(r'path to csv file', sep=',', usecols=['Url'], squeeze=True)
rows = []

for url in data:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    # CSS selectors need soup.select(), not soup.find_all()
    names = soup.select('div.celwidget div.aok-relative span.a-profile-name')
    rating = soup.select('div.celwidget div.aok-relative span.a-icon-alt')
    title = soup.select('div.celwidget div.aok-relative a.a-text-bold span')
    content = soup.select('div.celwidget div.aok-relative span.review-text-content span')
I want to scrape the names, ratings, etc. from the reviews up to the last page, where the Next Page button is disabled.
I'm not quite sure what to do from here; I looked around and many related questions use .click() on Next Page, which I don't think is the answer I need/want.
The next page url is stored in a list item with the class name a-last. So you could create a while loop that breaks once soup.find('li', class_='a-last') no longer returns anything (i.e. once the last page has been reached):
from selenium import webdriver
from bs4 import BeautifulSoup
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

url = 'https://www.amazon.com/Keychain-R-fun-Protective-Accessories-Visible-Sky/product-reviews/B082W7DL1R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews' #or https://www.amazon.com/s?k=maison+kitsune+airpod+pro+case
wd = webdriver.Chrome('chromedriver', options=options)

while True:
    wd.get(url)
    soup = BeautifulSoup(wd.page_source, "html.parser")
    #store data here
    try:
        url = 'https://www.amazon.com/' + soup.find('li', class_='a-last').find('a', href=True)['href']
        time.sleep(2) #prevent ban
    except:
        break
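For the "#store data here" part you can reuse the CSS paths from your question. A sketch (assuming those selectors still match Amazon's current markup, which changes often):
def extract_reviews(soup):
    # parse one result page and return a list of review dicts,
    # using the same CSS paths as in the question
    names = soup.select('div.celwidget div.aok-relative span.a-profile-name')
    ratings = soup.select('div.celwidget div.aok-relative span.a-icon-alt')
    titles = soup.select('div.celwidget div.aok-relative a.a-text-bold span')
    contents = soup.select('div.celwidget div.aok-relative span.review-text-content span')
    return [
        {
            'name': n.get_text(strip=True),
            'rating': r.get_text(strip=True),
            'title': t.get_text(strip=True),
            'content': c.get_text(strip=True),
        }
        for n, r, t, c in zip(names, ratings, titles, contents)
    ]

# define rows = [] before the while loop, then replace "#store data here" with:
#     rows.extend(extract_reviews(soup))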
I am currently trying to extract all the reviews for the Spiderman Homecoming movie, but I am only able to get the first 25. I was able to click "Load More" on IMDB to load all the reviews (originally it only shows the first 25), but for some reason I cannot mine the rest even after every review has been loaded. Does anyone know what I am doing wrong?
Below is the code I am running:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

#Set the web browser
driver = webdriver.Chrome(executable_path=r"C:\Users\Kent_\Desktop\WorkStudy\chromedriver.exe")

#Go to the IMDB reviews page
driver.get("https://www.imdb.com/title/tt6320628/reviews?ref_=tt_urv")

#Loop load more button
wait = WebDriverWait(driver, 10)
while True:
    try:
        driver.find_element_by_css_selector("button#load-more-trigger").click()
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".ipl-load-more__load-indicator")))
        soup = BeautifulSoup(driver.page_source, 'lxml')
    except Exception:
        break

#Scrape IMDB review
ans = driver.current_url
page = requests.get(ans)
soup = BeautifulSoup(page.content, "html.parser")
all = soup.find(id="main")

#Get the title of the movie
parent = all.find(class_="parent")
name = parent.find(itemprop="name")
url = name.find(itemprop='url')
film_title = url.get_text()
print('Pass finding phase.....')

#Get the title of the review
title_rev = all.select(".title")
title = [t.get_text().replace("\n", "") for t in title_rev]
print('getting title of reviews and saving into a list')

#Get the review
review_rev = all.select(".content .text")
review = [r.get_text() for r in review_rev]
print('getting content of reviews and saving into a list')

#Make it into dataframe
table_review = pd.DataFrame({
    "Title": title,
    "Review": review
})
table_review.to_csv('Spiderman_Reviews.csv')
print(title)
print(review)
Well, actually, there's no need to use Selenium. The data is available by sending a GET request to the website's API in the following format:
https://www.imdb.com/title/tt6320628/reviews/_ajax?ref_=undefined&paginationKey=MY-KEY
where you have to provide a key for the paginationKey in the URL (...&paginationKey=MY-KEY)
The key is found in the class load-more-data:
<div class="load-more-data" data-key="g4wp7crmqizdeyyf72ux5nrurdsmqhjjtzpwzouokkd2gbzgpnt6uc23o4zvtmzlb4d46f2swblzkwbgicjmquogo5tx2">
</div>
So, to scrape all the reviews into a DataFrame, try:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = (
    "https://www.imdb.com/title/tt6320628/reviews/_ajax?ref_=undefined&paginationKey={}"
)
key = ""
data = {"title": [], "review": []}

while True:
    response = requests.get(url.format(key))
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the pagination key
    pagination_key = soup.find("div", class_="load-more-data")
    if not pagination_key:
        break

    # Update the `key` variable in-order to scrape more reviews
    key = pagination_key["data-key"]
    for title, review in zip(
        soup.find_all(class_="title"), soup.find_all(class_="text show-more__control")
    ):
        data["title"].append(title.get_text(strip=True))
        data["review"].append(review.get_text())

df = pd.DataFrame(data)
print(df)
Output (truncated):
title review
0 Terrific entertainment Spiderman: Far from Home is not intended to be...
1 THe illusion of the identity of Spider man. Great story in continuation of spider man home...
2 What Happened to the Bad Guys I believe that Quinten Beck/Mysterio got what ...
3 Spectacular One of the best if not the best Spider-Man mov...
...
...
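If you also want the CSV file your original script produced, finish with:
df.to_csv("Spiderman_Reviews.csv", index=False)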
I am scraping https://www.skechers.com/women/shoes/athletic-sneakers/?start=0&sz=168 and my objective is to get all product ids, but my code won't print them:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time

url = 'https://www.skechers.com/women/shoes/athletic-sneakers/?start=0&sz=168'
driver = webdriver.Chrome('D:/chromedriver')
driver.get(url)
pageSource = driver.page_source
soup = BeautifulSoup(pageSource, 'html.parser')
content = soup.find_all('div', class_='col-6 col-sm-4 col-xl-3 mb-2 mb-md-1 mb-lg-4 px-lg-3')

skechersshoes = []
for item in content:
    pid = item.select_one('div[data-pid="product"]')
    print(pid)
    skechers = {
        'productid': pid
    }
    skechersshoes.append(skechers)

df = pd.DataFrame(skechersshoes)
print(df.head())
df.to_csv('skechers.csv')
There are a couple of ways to do this:
1. With Selenium, find all elements by XPath with the class product:
products_elements = driver.find_elements_by_xpath("//div[@class='product']")
and in a loop (or list comprehension) get each 'data-pid' value and put it into a list:
skechersshoes = [product.get_attribute('data-pid') for product in products_elements]
2. With BeautifulSoup, parse pageSource as you already do, then find_all the products:
products = soup.find_all(attrs={"class": "product"})
and again collect all the 'data-pid' values:
skechersshoes = [_id['data-pid'] for _id in products]
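Putting option 2 together with the DataFrame code from your question, a minimal sketch (same selectors as above; Selenium is kept only to obtain the rendered page source):
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

url = 'https://www.skechers.com/women/shoes/athletic-sneakers/?start=0&sz=168'
driver = webdriver.Chrome('D:/chromedriver')
driver.get(url)

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# every product tile carries its id in the data-pid attribute
products = soup.find_all(attrs={"class": "product"})
skechersshoes = [{'productid': p.get('data-pid')} for p in products]

df = pd.DataFrame(skechersshoes)
print(df.head())
df.to_csv('skechers.csv')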
I have code that extracts links from a list of urls (and pairs them with the url they were extracted from), but I would like to alter it to grab only links on those pages that do NOT contain target="_blank":
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

list_urls = ["https://example.com/1/", "https://example.com/2/", "https://example.com/3/"]
pagelinks = []

for url in list_urls:
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    for line in soup.find_all('a'):
        href = line.get('href')
        pagelinks.append([url, href])

df = pd.DataFrame(pagelinks, columns=["from", "to"])
Try:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

list_urls = ["https://example.com/1/", "https://example.com/2/", "https://example.com/3/"]
pagelinks = []

for url in list_urls:
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    for line in soup.find_all('a'):
        if not line.get('target') == '_blank':
            href = line.get('href')
            pagelinks.append([url, href])

df = pd.DataFrame(pagelinks, columns=["from", "to"])
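Alternatively, if your BeautifulSoup install includes the soupsieve CSS engine (the default for bs4 4.7+), the filter can be pushed into the selector itself. A sketch of the same idea (the webdriver.Chrome() setup is assumed, since it is not shown in the snippets above):
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

browser = webdriver.Chrome()  # assumed driver setup, not shown in the original snippets
list_urls = ["https://example.com/1/", "https://example.com/2/", "https://example.com/3/"]
pagelinks = []

for url in list_urls:
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    # keep only anchors whose target attribute is not "_blank"
    for line in soup.select('a:not([target="_blank"])'):
        pagelinks.append([url, line.get('href')])

browser.quit()
df = pd.DataFrame(pagelinks, columns=["from", "to"])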