Adding data to a list that comes from a website - python

The problem I'm having is with the line list_titles.append(title) near the bottom; it raises the error:
TypeError: append() takes no keyword arguments
Every time the loop runs I want to add the item to the list. Help please and thank you!
import urllib.request
from bs4 import BeautifulSoup
import time
import sys
import pprint

list_titles = []
list_recipes = []

def user_input():
    keyword = input('Please type in dinner ideas and we will give you recipes:')
    return keyword

def get_html():
    keyword = user_input()
    search_page_url = 'http://www.foodnetwork.com/search/'
    search_page = urllib.request.urlopen(search_page_url+keyword+'-') #downloads the webpage
    search_soup = BeautifulSoup(search_page, 'html.parser') #create a tree data structure out of html
    ### find all of the 'h3' tags with the class 'm-MediaBlock__a-Headline'
    ### and get the link by finding all the 'a' tags
    recipe_blocks = search_soup.find_all('h3', class_='m-MediaBlock__a-Headline') #finds h3 html tags with that class
    for recipe in recipe_blocks:
        recipe_url = recipe.find_all('a')[0].get('href') #get link as text
        print(recipe_url)
        page = urllib.request.urlopen('http:'+recipe_url)
        soup = BeautifulSoup(page, 'html.parser')
        for name in soup.find_all('div', class_='parbase assetTitle')[0].find_all('span', class_='o-AssetTitle__a-HeadlineText'):
            title = (' '+name.get_text()
            list_titles.append(title)  # <-- the error is reported here
        pprint.pprint(list_titles)
        for ingredients in soup.find_all('div', class_='o-Ingredients__m-Body')[0].find_all('label', class_='o-Ingredients__a-ListItemText'):
            recipe = (' '+ingredients.get_text()) #gets text from ingredients
            list_recipes.append(recipe)
        pprint.pprint(list_recipes)
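
Two separate things are going on here. First, list.append() accepts exactly one positional argument, and it raises this exact TypeError only when it is called with a keyword argument:

items = []
items.append('pie')        # correct: one positional argument
items.append(title='pie')  # TypeError: append() takes no keyword arguments

Second, the line title = (' '+name.get_text() is missing its closing parenthesis, so the interpreter never sees the append call as a standalone statement and misreports the problem. Balancing the parentheses fixes the loop:

title = (' ' + name.get_text())
list_titles.append(title)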

Related

Python: Get element next to href

Python code:
url = 'https://www.basketball-reference.com/players/'
initial = list(string.ascii_lowercase)
initial_url = [url + i for i in initial]
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
tags_initial = [i('a') for i in soup_initial]
print(tags_initial[0][50])
Results example:
Shareef Abdur-Rahim
From the example above, I want to extract just the name of the player, 'Shareef Abdur-Rahim', but I want to do it for every tag in all of the tags_initial lists. Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe this could help you:
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://www.basketball-reference.com/players/'
# Alphabet
initial = list(string.ascii_lowercase)
datas = []
# URLS
urls = [url + i for i in initial]
for url in urls:
    # Soup Object
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    # Players link
    url_links = soup.findAll("a", href=re.compile("players"))
    for link in url_links:
        # Player name
        datas.append(link.string)
print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
index = html.find("a href")
index = html.find(">", index) + 1
index_end = html.find("<", index)
print(html[index:index_end])
If you're using a scraper library it probably has a similar function built-in.
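For comparison, BeautifulSoup (already used above) does the same extraction in one call:

#the same extraction with BeautifulSoup: parse the fragment and read
#the text of the first anchor tag
from bs4 import BeautifulSoup

html = '<a href="/teams/LAL/2021.html">Los Angeles Lakers</a>'
print(BeautifulSoup(html, 'html.parser').a.get_text())  # Los Angeles Lakers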

How can I get data from a website using BeautifulSoup and requests?

I am a beginner at web scraping, and I need help with this problem.
Allrecipes.com is a website where you can find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click -> view page source)
I want to create a program that takes an input, searches for it on allrecipes, and returns a list of tuples for the first five recipes, with data such as the time it takes to make, the serving yield, the ingredients, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup

def searchdata():
    inp = input('what recipe would you like to search')
    url = 'http://www.allrecipes.com/search/results/?wt=' + str(inp) + '&sort=re'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    #fill in code for finding top 3 or five links
    for i in range(3):
        a = requests.get(links[i])
        soupa = BeautifulSoup(a.text, 'html.parser')
        #fill in code to find name, ingredients, time, and serving size with data from soupa
        names = []
        time = []
        servings = []
        ratings = []
        ingrediants = []

searchdata()
Yes, I know my code is very messy, but what should I fill in in the two fill-in areas?
Thanks
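
For the first fill-in area, a minimal sketch (assuming the search results page uses the same h3 class, fixed-recipe-card__h3, that the answer below relies on) could be:

#sketch for the first fill-in: collect the first few recipe links from
#the search-results soup (selector assumed from the answer below)
for h3 in soup.find_all('h3', class_='fixed-recipe-card__h3')[:3]:
    if h3.a:  # skip headline blocks without a link
        links.append(h3.a['href'])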
After searching for the recipe you have to get the link of each recipe and then send a request to each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP, so here's the class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup

class Scraper:
    links = []
    names = []

    def get_url(self, url):
        url = requests.get(url)
        self.soup = BeautifulSoup(url.content, 'html.parser')

    def print_info(self, name):
        self.get_url(f'https://www.allrecipes.com/search/results/?wt={name}&sort=re')
        if self.soup.find('span', class_='subtext').text.strip()[0] == '0':
            print(f'No recipes found for {name}')
            return
        results = self.soup.find('section', id='fixedGridSection')
        articles = results.find_all('article')
        texts = []
        for article in articles:
            txt = article.find('h3', class_='fixed-recipe-card__h3')
            if txt:
                if len(texts) < 5:
                    texts.append(txt)
                else:
                    break
        self.links = [txt.a['href'] for txt in texts]
        self.names = [txt.a.span.text for txt in texts]
        self.get_data()

    def get_data(self):
        for i, link in enumerate(self.links):
            self.get_url(link)
            print('-' * 4 + self.names[i] + '-' * 4)
            info_names = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-header')]
            ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
            ingredients = [span.text.strip() for span in ingredient_spans]
            for i, div in enumerate(self.soup.find_all('div', class_='recipe-meta-item-body')):
                print(info_names[i].capitalize(), div.text.strip())
            print()
            print('Ingredients'.center(len(ingredients[0]), ' '))
            print('\n'.join(ingredients))
            print()
            print('*' * 50, end='\n\n')

chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))

Fix the syntax error of a list comprehension that contains beautiful soup methods

I tried hard, but there is always some syntax error with the piece of code that follows.
import urllib.request
import re
import csv
from bs4 import BeautifulSoup
from bs4 import NavigableString
from unicodedata import normalize

url = input('Please paste the link here: ')
html = urllib.request.urlretrieve(url)
html_file = open(html[0])
soup = BeautifulSoup(html_file, 'html5lib')

def contains_href(tag):
    return tag.find('a', href=True)

scrollables = [table in soup.find_all('table', class_='sc_courselist') if contains_href(table)]

def num_name_unit(tag):
    td_num = tag.find('td', href=True)
    num = normalize('NFKD', td_num.string.strip())
    td_name = tag.find('td', class_=False)
    name = normalize('NFKD', td_name.string.strip())
    td_unit = tag.find('td', class_='hourscol')
    unit = normalize('NFKD', td_unit.string.strip())
    row = ['Course Number: {0} | Course Name: {1} | Course Unit: {2}'.format(num, name, unit)]
    return row

dic_rows = {scrollable.find_previous_siblings(re.compile('h'), class_=False, limit=1).string.strip(): list(num_name_unit(tr) for tr in scrollable.find_all('tr', contains_href)) for scrollable in scrollables}
I expect the terminal to print the prompt "Please paste the link here: ". In reality, it reports "invalid syntax" at the end of scrollables = [table in soup.find_all('table', class_='sc_courselist') if contains_href(table)].
You are missing the for part in your list comprehension. It should be:
[table for table in soup.find_all('table', class_='sc_courselist') if contains_href(table)]
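For reference, the general form of a filtering list comprehension is [expression for item in iterable if condition]:

#general filtering form: [expression for item in iterable if condition]
evens = [n for n in range(10) if n % 2 == 0]  # [0, 2, 4, 6, 8]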

web scraping For loop stuck on first item on list- python

I'm trying to loop through a list of names. It seems like it should be very simple, but it's not working: I only receive the first name on the list and nothing more! Why is it not moving on to the next name?
Here's my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.aus.edu/info/200170/college_of_architecture_art_and_design/269/department_of_architecture/4')
soup = BeautifulSoup(page.content, "html.parser")
content = soup.find(class_="supContact")
content_items = content.find_all(class_="contactToggle selected")
names = content_items[0]
s_name = []
for item in name:
    s_name.append(content.find(class_="contactToggle selected").text)
    if not names:
        continue
print(s_name)
#print(names.prettify())
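
A few things conspire here: names = content_items[0] keeps only the first matched element, for item in name iterates over a variable that is never defined (names is presumably meant), and the append line calls content.find(...), which always returns the first match, on every pass. A sketch of the likely fix is to iterate over the full find_all result instead:

#sketch of a fix: find_all already returned every matching element,
#so iterate over that list rather than re-querying the first match
s_name = [item.get_text(strip=True) for item in content_items]
print(s_name)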

Using BeautifulSoup to scrape li's and id's in same method

How would I modify the parameters of the findAll method to read both li's and id's? li's are elements and id's are attributes, correct?
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites
import csv
import requests
from bs4 import BeautifulSoup

###################### SURFLINE URL STRINGS AND TAG ###########################
slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'
slTagText = 'observed-wave-range'
slTag = 'id'

#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl, slCardiffUrl]
###############################################################################

#################### MAGICSEAWEED URL STRINGS AND TAG #########################
msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'
msTagText = 'rating-text text-dark'
msTag = 'li'

#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]
###############################################################################

'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location

rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each break
tag: the html tag where the actual report lives on the page

returns: a list of strings of each break's surf report
'''
def extract_Reports(rootUrl, urlList, tag, tagText):
    #empty list to hold reports
    reports = []

    #loop thru URLs
    for url in urlList:
        try:
            #request page
            request = requests.get(rootUrl + url)

            #turn into soup
            soup = BeautifulSoup(request.content, 'lxml')

            #get the tag where report lives
            reportTag = soup.findAll(id = tagText)

            for report in reportTag:
                reports.append(report.string.strip())

        #notify if fail
        except:
            print('scrape failure')
            pass

    return reports
#END METHOD

slReports = extract_Reports(slRootUrl, slUrls, slTag, slTagText)
msReports = extract_Reports(msRootUrl, msUrls, msTag, msTagText)

print(slReports)
print(msReports)
As of right now, only slReports prints correctly because I have it explicitly set to id = tagText. I am also aware that my tag parameter is not currently used.
So the problem is that you want to search the parse tree for elements that have either a class name of rating-text (it turns out you do not need text-dark to identify the relevant elements in the case of Magicseaweed) or an ID of observed-wave-range, using a single findAll call.
You can use a filter function to achieve this:
def reportTagFilter(tag):
    return (tag.has_attr('class') and 'rating-text' in tag['class']) \
        or (tag.has_attr('id') and tag['id'] == 'observed-wave-range')
Then change your extract_Reports function to read:
reportTag = soup.findAll(reportTagFilter)[0]
reports.append(reportTag.text.strip())
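
For reference, an equivalent alternative is BeautifulSoup's CSS-selector interface, which accepts a comma-separated selector list and expresses the same either/or condition without a filter function:

#equivalent using CSS selectors: match either the rating-text class
#or the observed-wave-range id in a single query
reportTags = soup.select('.rating-text, #observed-wave-range')
for reportTag in reportTags:
    reports.append(reportTag.get_text().strip())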
