web scraper returns wrong data - python

import requests
from bs4 import BeautifulSoup
Year = input("What year would you like to travel to? YYY-MM-DD ")
URL = "https://www.billboard.com/charts/hot-100/"
URL += URL + Year
response = requests.get(URL)
data = response.text
soup = BeautifulSoup(data,"html.parser")
songs = soup.find_all(name='h3', id="title-of-a-story")
all_songs = [song.getText() for song in songs]
print(all_songs)
I'm new to web scraping. The script is supposed to give me the list of songs in the Hot 100 for the date I specify, but it is giving me news headlines instead. Why is it returning the wrong data?

Try printing URL before making a request:
https://www.billboard.com/charts/hot-100/https://www.billboard.com/charts/hot-100/2022-01-01
That's clearly wrong: the base part appears twice. The line URL += URL + Year is the culprit; it should have been URL = URL + Year.

Adding to what Sasszem mentioned above:
import requests
from bs4 import BeautifulSoup

Year = input("What year would you like to travel to? YYYY-MM-DD ")
URL = "https://www.billboard.com/charts/hot-100/"
URL = URL + Year
response = requests.get(URL)
data = response.text
songs = []
soup = BeautifulSoup(data, "html.parser")
# instead of directly jumping to the element, I found the container element first
# to restrict the code to a specific section of the website
container = soup.find_all(class_='lrv-a-unstyle-list lrv-u-flex lrv-u-height-100p lrv-u-flex-direction-column#mobile-max')
for x in container:
    song = x.find(id="title-of-a-story")  # locating the element that contains the title text in that specific 'container'
    songs.append(song)
all_songs = [song.getText() for song in songs]  # getting all the song titles in a list
print(all_songs)  # ['\n\n\t\n\t\n\t\t\n\t\t\t\t\tAll I Want For Christmas Is You\t\t\n\t\n'] - there is a weird prefix and suffix of strings with every title
# removing the prefix and suffix strings
final_output = []
for i in all_songs:
    final_output.append(i[14:-5])
print(final_output)
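As a side note, the i[14:-5] slice works for this particular page layout but is brittle; a minimal sketch of a sturdier alternative, assuming the unwanted prefix and suffix are just whitespace, is to let BeautifulSoup strip it:
all_songs = [song.getText(strip=True) for song in songs]  # strip=True removes surrounding whitespace from each title
print(all_songs)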

Related

How to deal with standardized html having abnormal entry

Someone was kind enough to help me put together a web scraper for a government website.
The code:
import urllib.request
from pywebcopy import save_webpage
import requests
from bs4 import BeautifulSoup as Soup

url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory&CycleBeginYear="
year = 2018  # This variable can be changed to whatever year you want to parse
url = url + str(year)  # combined the government url with the chosen year
response = requests.get(url)
response.raise_for_status()
soup = Soup(response.content, "html.parser")

# This class contains all 4 fields in the NHANES table
class Chemical:
    def __init__(self, chemical_name, doc_file, data_file, last_updated):
        self.chemical_name = chemical_name
        self.doc_file = doc_file
        self.data_file = data_file
        self.last_updated = last_updated

chemicalArray = []  # initiating array
for row in soup.find("tbody").find_all("tr"):
    name, *files, date = row.find_all("td")
    hrefs = [file.a["href"] for file in files]  # this is where I run into an error
    chemical = Chemical(name.get_text(strip=True), hrefs[0], hrefs[1], date.get_text(strip=True))
    chemicalArray.append(chemical)
However, for some years there are entries where the data file has been withdrawn, so there is no href in the "a" tag. I am not sure how to handle this case; basically, I need to figure out how to deal with a missing href.
Test if it has the href attribute before trying to access it.
hrefs = [file.a["href"] for file in files if file.a and "href" in file.a.attrs]
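If you still need one entry per column (so that hrefs[0] and hrefs[1] keep their positions), a variant of this sketch keeps a placeholder for withdrawn files instead of dropping them; None is just an assumed sentinel value here:
hrefs = [file.a["href"] if file.a and "href" in file.a.attrs else None for file in files]  # None marks a withdrawn file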

Asking the user to input something and use Beautiful Soup to parse a website

I am supposed to use Beautiful Soup 4 to obtain course information off of my school's website as an exercise. I have been at this for the past few days and my code still does not work.
The first thing I ask the user for is the course catalog abbreviation. For example, ICS is the abbreviation for Information and Computer Sciences. Beautiful Soup 4 is then supposed to list all of the courses and how many students are enrolled.
While I was able to get the input portion to work, I still have errors or the program just stops.
Question: Is there a way for Beautiful Soup to accept user input so that when the user inputs ICS, the output would be a list of all courses that are related to ICS?
Here is the code and my attempt at it:
from bs4 import BeautifulSoup
import requests
import re

# get input for course
course = input('Enter the course:')
# Here is the page link
BASE_AVAILABILITY_URL = f"https://www.sis.hawaii.edu/uhdad/avail.classes?i=MAN&t=202010&s={course}"
# get request and response
page_response = requests.get(BASE_AVAILABILITY_URL)
# getting Beautiful Soup to gather the html content
page_content = BeautifulSoup(page_response.content, 'html.parser')
# getting course information
main = page_content.find_all(class_='parent clearfix')
main_p = "".join(str(x) for x in main)
# get the course anchor tags
main_q = BeautifulSoup(main_p, "html.parser")
courses = main.find('a', href=True)
# get each course name
# empty list for course names
courses_list = []
for a in courses:
    courses_list.append(a.text)
search = input('Enter the course title:')
for course in courses_list:
    if re.search(search, course, re.IGNORECASE):
        print(course)
This is the original code that was provided in the Jupyter Notebook:
import requests, bs4

BASE_AVAILABILITY_URL = f"https://www.sis.hawaii.edu/uhdad/avail.classes?i=MAN&t=202010&s={course}"
# get input for course
course = input('Enter the course:')

def scrape_availability(text):
    soup = bs4.BeautifulSoup(text)
    r = requests.get(str(BASE_AVAILABILITY_URL) + str(course))
    rows = soup.select('.listOfClasses tr')
    for row in rows[1:]:
        columns = row.select('td')
        class_name = columns[2].contents[0]
        if len(class_name) > 1 and class_name != b'\xa0':
            print(class_name)
            print(columns[4].contents[0])
            print(columns[7].contents[0])
            print(columns[8].contents[0])
What's odd is that if the user saves the html file, uploads it into the Jupyter Notebook, and then opens the file to be read, the courses are displayed. But for this task the user cannot save files; it must be an outright input to get the output.
The problem with your code is that page_content.find_all(class_='parent clearfix') returns an empty list []. So that's the first thing you need to change. Looking at the html, you'll want to be looking at the <table>, <tr>, and <td> tags.
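For example, a quick check along these lines (a sketch; the exact selector depends on the page's real markup) shows whether a selector actually matches any rows before you try to extract data from them:
rows = page_content.select('table tr')
print(len(rows))  # 0 means the selector matched nothing, so any loop over it never runs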
Working off what was provided in the original code, you just need to alter a few things so that it flows logically. I'll point out what I changed:
import requests, bs4

BASE_AVAILABILITY_URL = f"https://www.sis.hawaii.edu/uhdad/avail.classes?i=MAN&t=202010&s={course}"
# get input for course
course = input('Enter the course:')

def scrape_availability(text):
    soup = bs4.BeautifulSoup(text)  # <-- need to get the html text before creating a bs4 object. So I moved the request (line below) before this, and also adjusted the parameter for this function.
    # the rest of the code is fine
    r = requests.get(str(BASE_AVAILABILITY_URL) + str(course))
    rows = soup.select('.listOfClasses tr')
    for row in rows[1:]:
        columns = row.select('td')
        class_name = columns[2].contents[0]
        if len(class_name) > 1 and class_name != b'\xa0':
            print(class_name)
            print(columns[4].contents[0])
            print(columns[7].contents[0])
            print(columns[8].contents[0])
This will give you:
import requests, bs4

BASE_AVAILABILITY_URL = "https://www.sis.hawaii.edu/uhdad/avail.classes?i=MAN&t=202010&s="
# get input for course
course = input('Enter the course:')
url = BASE_AVAILABILITY_URL + course

def scrape_availability(url):
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    rows = soup.select('.listOfClasses tr')
    for row in rows[1:]:
        columns = row.select('td')
        class_name = columns[2].contents[0]
        if len(class_name) > 1 and class_name != b'\xa0':
            print(class_name)
            print(columns[4].contents[0])
            print(columns[7].contents[0])
            print(columns[8].contents[0])

scrape_availability(url)

How to fetch links from a file and loop over them in Python?

I have a txt file with these values:
https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/
http://www.redbook.com.au/cars/research/used/details/1968-ford-fairmont-xt-manual/SPOT-ITM-336135
http://www.redbook.com.au/cars/research/used/details/1968-ford-f100-manual/SPOT-ITM-317784
code :
from bs4 import BeautifulSoup
import requests
from lxml import html  # needed for html.fromstring below
import pandas as pd    # needed for the DataFrame at the end

url = 'https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/'
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)
tree = html.fromstring(page.content)
car_data = {}
# Overview
if tree.xpath('//tr[td="Badge"]//following-sibling::td[2]/text()'):
    badge = tree.xpath('//tr[td="Badge"]//following-sibling::td[2]/text()')[0]
    car_data["badge"] = badge
if tree.xpath('//tr[td="Series"]//following-sibling::td[2]/text()'):
    car_data["series"] = tree.xpath('//tr[td="Series"]//following-sibling::td[2]/text()')[0]
if tree.xpath('//tr[td="Body"]//following-sibling::td[2]/text()'):
    car_data["body_small"] = tree.xpath('//tr[td="Body"]//following-sibling::td[2]/text()')[0]
df = pd.DataFrame([car_data])
output:
df =
              badge body_small    series
0  50 Years Edition      Sedan  10th Gen
How can I take all the urls from the txt file and loop through them so that all the values are appended into a dict or df?
expected output
              badge body_small       series
0  50 Years Edition      Sedan     10th Gen
1        (No Badge)      Sedan           XT
2        (No Badge)    Utility  (No Series)
I tried converting the file into a list and using a for loop:
url = ['https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/','http://www.redbook.com.au/cars/research/used/details/1966-ford-falcon-deluxe-xp-manual/SPOT-ITM-386381']
headers = {'User-Agent':'Mozilla/5.0'}
for lop in url:
    page = (requests.get(lop, headers=headers))
but only one url's value is generated. And if there are 1000 urls, converting them into a list will take a lot of time.
The problem with your code is that you are overwriting the variable 'page' again and again in the for loop, hence you will get the data of the last request only.
Below is the corrected code:
url = ['https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/','http://www.redbook.com.au/cars/research/used/details/1966-ford-falcon-deluxe-xp-manual/SPOT-ITM-386381']
headers = {'User-Agent':'Mozilla/5.0'}
page = []
for lop in url:
    page.append(requests.get(lop, headers=headers).text)
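Each stored response can then be parsed on its own; for example, a short sketch reusing lxml as in the question's code:
from lxml import html  # the question's code relies on lxml's html module
for text in page:
    tree = html.fromstring(text)
    # apply the question's XPath lookups to each 'tree' here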
Here is another approach (the code will generate a dictionary where each entry is the url (key) plus the scraped data (value)):
from bs4 import BeautifulSoup
import requests

def get_cars_data(url):
    cars_data = {}
    # TODO read the data using requests and with BS populate 'cars_data'
    return cars_data

all_cars = {}
with open('urls.txt') as f:
    urls = [line.strip() for line in f.readlines()]
for url in urls:
    all_cars[url] = get_cars_data(url)
print('done')
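To fill in that TODO, here is a sketch of get_cars_data using the XPath lookups from the question (assuming lxml is used for parsing, as in the question's code):
from lxml import html
import requests

def get_cars_data(url):
    cars_data = {}
    headers = {'User-Agent': 'Mozilla/5.0'}
    tree = html.fromstring(requests.get(url, headers=headers).content)
    # reuse the question's XPath pattern for each field
    for key, label in [('badge', 'Badge'), ('series', 'Series'), ('body_small', 'Body')]:
        match = tree.xpath('//tr[td="{}"]//following-sibling::td[2]/text()'.format(label))
        if match:
            cars_data[key] = match[0]
    return cars_data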
If I understood your question correctly, then this is the answer to your question.
from bs4 import BeautifulSoup
import requests
from lxml import html  # needed for html.fromstring below

cars = []  # global array for storing each car_data object
f = open("file.txt", 'r')  # file.txt would contain all the links that you wish to read
# This for loop will perform your thing for each url in the file
for url in f:
    url = url.strip()  # remove the trailing newline that comes with each line of the file
    car_data = {}  # use it as a local variable
    headers = {'User-Agent': 'Mozilla/5.0'}
    page = requests.get(url, headers=headers)
    tree = html.fromstring(page.content)
    # Overview
    if tree.xpath('//tr[td="Badge"]//following-sibling::td[2]/text()'):
        badge = tree.xpath('//tr[td="Badge"]//following-sibling::td[2]/text()')[0]
        car_data["badge"] = badge
    if tree.xpath('//tr[td="Series"]//following-sibling::td[2]/text()'):
        car_data["series"] = tree.xpath('//tr[td="Series"]//following-sibling::td[2]/text()')[0]
    if tree.xpath('//tr[td="Body"]//following-sibling::td[2]/text()'):
        car_data["body_small"] = tree.xpath('//tr[td="Body"]//following-sibling::td[2]/text()')[0]
    cars.append(car_data)  # Append it to the global array
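To end up with the DataFrame shown in the expected output, the collected list can be converted at the end; a minimal sketch, assuming pandas is installed:
import pandas as pd

df = pd.DataFrame(cars)  # one row per url, with badge / series / body_small as columns
print(df)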

I can't browse the pages Beautifulsoup

I am a beginner in web scraping and I am following this tutorial to extract movie data from this link. I chose to extract movies between 2016 and 2019 for the test. I get just 25 lines but I want more than 30,000.
Do you think it's possible ?
this is the code :
from requests import get
from bs4 import BeautifulSoup
import csv
import pandas as pd
from time import sleep
from random import randint
from time import time
from warnings import warn  # needed for the warn() calls below
from IPython.core.display import clear_output

headers = {"Accept-Language": "en-US, en;q=0.5"}
pages = [str(i) for i in range(1, 5)]
years_url = [str(i) for i in range(2000, 2018)]
url = 'https://www.imdb.com/search/title?release_date=2016-01-01,2019-05-01'
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
start_time = time()
requests = 0
for year_url in years_url:
    # For every page in the interval 1-4
    for page in pages:
        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url + '&sort=num_votes,desc&page=' + page, headers = headers)
        # Pause the loop
        sleep(randint(8, 15))
        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')
# Parse the content of the request with BeautifulSoup
page_html = BeautifulSoup(response.text, 'html.parser')
# Select all the 50 movie containers from a single page
mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
# Extract data from individual movie container
for container in movie_containers:
    # If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))
movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes
                              })
# data cleansing
movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes']]
movie_ratings.head()
movie_ratings['year'].unique()
movie_ratings.to_csv('movie_ratings.csv')
Start by double-checking your indentation throughout (in fact - naughty naughty - it is wrong in that tutorial; I am guessing it wasn't properly proofread after publishing and the code has wrongly been left-aligned repeatedly).
To illustrate, you currently have something like this (reduced lines of code shown):
for year_url in years_url:
    for page in pages:
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +'&sort=num_votes,desc&page=' + page, headers = headers)

page_html = BeautifulSoup(response.text, 'html.parser')
Your indentation means that, if the code runs at all, you are only parsing the html of the last url you visited.
It should be:
for year_url in years_url:
    for page in pages:
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +'&sort=num_votes,desc&page=' + page, headers = headers)
        page_html = BeautifulSoup(response.text, 'html.parser')
Indentation gives meaning in Python.
https://docs.python.org/3/reference/lexical_analysis.html?highlight=indentation
Leading whitespace (spaces and tabs) at the beginning of a logical line is used to compute the indentation level of the line, which in turn is used to determine the grouping of statements.
It's hard to tell exactly what the issue is here because of the lack of functions, but from what I see you need to parse each page separately.
After every request, you need to parse the text. However, I suspect the main issue is the ordering of your code; I would suggest using functions.
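As an illustration of that suggestion, here is a minimal sketch of a function-based structure; it reuses the names from the question's code (get, headers, years_url, pages) and elides the field extraction:
def parse_page(html_text):
    # parse one results page and return its movie containers
    page_html = BeautifulSoup(html_text, 'html.parser')
    return page_html.find_all('div', class_='lister-item mode-advanced')

for year_url in years_url:
    for page in pages:
        response = get('http://www.imdb.com/search/title?release_date=' + year_url + '&sort=num_votes,desc&page=' + page, headers=headers)
        for container in parse_page(response.text):
            pass  # extract name, year, rating, metascore and votes here, as in the question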

How can I loop through an unknown number of pages and get their text after the year is substituted in the url?

I am trying to extract some information based on the year entered in the url. The information extracted is from an unknown number of pages.
How can I get the new url after the year is substituted so that this url can be passed for processing the content extracted from multiple pages? Also, I want to be able to get all the information from all the unknown number of pages.
As I understood, I would need a while loop. How do I check if there exists a next page?
Is there an efficient way to do this? Thanks!
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from urllib import parse
from time import sleep

input_year = int(input("Enter year here >>: "))

def print_info(response_text):
    soup = BeautifulSoup(response_text, 'lxml')
    for info in soup.find_all('div', class_='grid'):
        for a in info.find_all('a'):
            if a.parent.name == 'div':
                print(''.join(text for text in a.find_all(text=True)))

url = 'https://mywebsite.org/archive.pl?op=bytime&keyword=&year={}&page={}'.format(input_year, 1)
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

# current page number
page_number_tag = soup.find('span', class_='active tcenter')
page_number = page_number_tag.text

# next page number
for x in soup.find_all('div', class_='t'):
    for a in x.find_all('a'):
        if a.parent.name == 'div':
            next_page_number = ''.join(text for text in a.find_all(text=True))
Assuming you have the variables year and page already, you can use string formatting to build a new url with those values:
base_url = 'https://mywebsite.com/archive.pl?op=bytime&keyword=&year=%s&page=%s'
new_url = base_url % (year, page)
Use format and pass multiple arguments like below. This is an example; you can specify year and page the way you want.
year = 2019
for page in range(1, 10):
    url = 'https://mywebsite.com/archive.pl?op=bytime&keyword=&year={}&page={}'.format(year, page)
    print(url)
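Putting the pieces together, here is a minimal sketch of the while loop the question asks about; it assumes that a page with no 'grid' divs means you have run past the last page, which is a guess about the site's markup and may need adjusting:
import requests
from bs4 import BeautifulSoup

def scrape_all_pages(year):
    page = 1
    while True:
        url = 'https://mywebsite.org/archive.pl?op=bytime&keyword=&year={}&page={}'.format(year, page)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'lxml')
        if not soup.find_all('div', class_='grid'):
            break  # assumption: no 'grid' divs means there is no next page
        print_info(response.text)  # reuse the helper defined in the question
        page += 1

scrape_all_pages(input_year)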
