Python code:
url = 'https://www.basketball-reference.com/players/'
initial = list(string.ascii_lowercase)
initial_url = [url + i for i in initial]
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
tags_initial = [i('a') for i in soup_initial]
print(tags_initial[0][50])
Results example:
Shareef Abdur-Rahim
From the example above, I want to extract the name of the players which is 'Shareef Abdur-Rahim', but I want to do it for all the tags_initial lists,
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe that could help you :
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = 'https://www.basketball-reference.com/players/'
# Alphabet
initial = list(string.ascii_lowercase)
datas = []
# URLS
urls = [url + i for i in initial]
for url in urls:
# Soup Object
soup = BeautifulSoup(urlopen(url), 'html.parser')
# Players link
url_links = soup.findAll("a", href=re.compile("players"))
for link in url_links:
# Player name
datas.append(link.string)
print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
index = html.find("a href")
index = html.find(">", index) + 1
index_end = html.find("<", index)
print(html[index:index_end])
If you're using a scraper library it probably has a similar function built-in.
Related
I am trying to get all emails from a specific page and separate them into an individual variable or even better a dictionary. This is some code.
import requests
import re
import json
from bs4 import BeautifulSoup
page = "http://www.example.net"
info = requests.get(page)
if info.status_code == 200:
print("Page accessed")
else:
print("Error accessing page")
code = info.content
soup = BeautifulSoup(code, 'lxml')
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
print(allEmails)
sep = ","
allEmailsStr = str(allEmails)
print(type(allEmails))
print(type(allEmailsStr))
j = allEmailsStr.split(sep, 1)[0]
print(j)
Excuse the poor variable names because I put this together so it would be fine by itself. The output from the example website would be for example something like
[k, kolyma, location, balkans]
So if I ran the problem it would return only
[k
But if I wanted it to return every email on there individually how would I do that?
To get just the email str you can try:
emails = []
for email_link in allEmails:
emails.append(email_link.get("href").replace('mailto:', ''))
print(emails)
Based on your expected output, you can use the unwrap function of BeautifulSoup
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
for Email in allEmails:
print(Email.unwrap()) #This will print the whole element along with tag
# k
I suspect this isn't very complicated, but I can't see to figure it out. I'm using Selenium and Beautiful Soup to parse Petango.com. Data will be used to help a local shelter understand how they compare in different metrics to other area shelters. so next will be taking these data frames and doing some analysis.
I grab detail urls from a different module and import the list here.
My issue is, my lists are only showing the value from the HTML from the first dog. I was stepping through and noticed my len are different for the soup iterations, so I realize my error is after that somewhere but I can't figure out how to fix.
Here is my code so far (running the whole process vs using a cached page)
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from Petango import pet_links
headings = []
values = []
ShelterInfo = []
ShelterInfoWebsite = []
ShelterInfoEmail = []
ShelterInfoPhone = []
ShelterInfoAddress = []
Breed = []
Age = []
Color = []
SpayedNeutered = []
Size = []
Declawed = []
AdoptionDate = []
# to access sites, change url list to pet_links (break out as needed) and change if false to true. false looks to the html file
url_list = (pet_links[4], pet_links[6], pet_links[8])
#url_list = ("Petango.html", "Petango.html", "Petango.html")
for link in url_list:
page_source = None
if True:
#pet page = link should populate links from above, hard code link was for 1 detail page, =to hemtl was for cached site
PetPage = link
#PetPage = 'https://www.petango.com/Adopt/Dog-Terrier-American-Pit-Bull-45569732'
#PetPage = Petango.html
PetDriver = webdriver.Chrome(executable_path='/Users/paulcarson/Downloads/chromedriver')
PetDriver.implicitly_wait(30)
PetDriver.get(link)
page_source = PetDriver.page_source
PetDriver.close()
else:
with open("Petango.html",'r') as f:
page_source = f.read()
PetSoup = BeautifulSoup(page_source, 'html.parser')
print(len(PetSoup.text))
#get the details about the shelter and add to lists
ShelterInfo.append(PetSoup.find('div', class_ = "DNNModuleContent ModPethealthPetangoDnnModulesShelterShortInfoC").find('h4').text)
ShelterInfoParagraphs = PetSoup.find('div', class_ = "DNNModuleContent ModPethealthPetangoDnnModulesShelterShortInfoC").find_all('p')
First_Paragraph = ShelterInfoParagraphs[0]
if "Website" not in First_Paragraph.text:
raise AssertionError("first paragraph is not about site")
ShelterInfoWebsite.append(First_Paragraph.find('a').text)
Second_Paragraph = ShelterInfoParagraphs[1]
ShelterInfoEmail.append(Second_Paragraph.find('a')['href'])
Third_Paragraph = ShelterInfoParagraphs[2]
ShelterInfoPhone.append(Third_Paragraph.find('span').text)
Fourth_Paragraph = ShelterInfoParagraphs[3]
ShelterInfoAddress.append(Fourth_Paragraph.find('span').text)
#get the details about the pet
ul = PetSoup.find('div', class_='group details-list').ul # Gets the ul tag
li_items = ul.find_all('li') # Finds all the li tags within the ul tag
for li in li_items:
heading = li.strong.text
headings.append(heading)
value = li.span.text
if value:
values.append(value)
else:
values.append(None)
Breed.append(values[0])
Age.append(values[1])
print(Age)
Color.append(values[2])
SpayedNeutered.append(values[3])
Size.append(values[4])
Declawed.append(values[5])
AdoptionDate.append(values[6])
ShelterDF = pd.DataFrame(
{
'Shelter': ShelterInfo,
'Shelter Website': ShelterInfoWebsite,
'Shelter Email': ShelterInfoEmail,
'Shelter Phone Number': ShelterInfoPhone,
'Shelter Address': ShelterInfoAddress
})
PetDF = pd.DataFrame(
{'Breed': Breed,
'Age': Age,
'Color': Color,
'Spayed/Neutered': SpayedNeutered,
'Size': Size,
'Declawed': Declawed,
'Adoption Date': AdoptionDate
})
print(PetDF)
print(ShelterDF)
output from print len and print the age value as the loop progresses
12783
['6y 7m']
10687
['6y 7m', '6y 7m']
10705
['6y 7m', '6y 7m', '6y 7m']
Could someone please point me in the right direction?
Thank you for your help!
Paul
You need to change the find method into find_all() in BeautifulSoup so it locates all the elements.
Values is global and you only ever append the first value in this list to Age
Age.append(values[1])
Same problem for your other global lists (static index whether 1 or 2 etc...).
You need a way to track the appropriate index to use perhaps through a counter, or determine other logic to ensure current value is added e.g. with current Age, is it is the second li in the loop? Or just append PetSoup.select_one("[data-bind='text: age']").text
It looks like each item of interest e.g. colour, spayed contains the data-bind attribute so you can use those with the appropriate attribute value to select each value and avoid a loop over li elements.
e.g. current_colour = PetSoup.select_one("[data-bind='text: color']").text
Best to set in a variable and test is not None before accessing with .text
I am a beginner in web scraping, and I need help with this problem.
The website, allrecipes.com, is a website where you can find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click-> view page source)
I want to create a program that takes a input, searches it up on allrecipes, and returns a list with tuples of the first five recipes with data such as the time that takes to make, serving yield, ingrediants, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup
def searchdata():
inp=input('what recipe would you like to search')
url ='http://www.allrecipes.com/search/results/?wt='+str(inp)+'&sort=re'
r=requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
links=[]
#fill in code for finding top 3 or five links
for i in range(3)
a = requests.get(links[i])
soupa = BeautifulSoup(a.text, 'html.parser')
#fill in code to find name, ingrediants, time, and serving size with data from soupa
names=[]
time=[]
servings=[]
ratings=[]
ingrediants=[]
searchdata()
Yes, i know, my code is very messy but What should I fill in in the two code fill-in areas?
Thanks
After searching for the recipe you have to get the links of each recipe and then request again for each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP so here's the class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup
class Scraper:
links = []
names = []
def get_url(self, url):
url = requests.get(url)
self.soup = BeautifulSoup(url.content, 'html.parser')
def print_info(self, name):
self.get_url(f'https://www.allrecipes.com/search/results/?wt={name}&sort=re')
if self.soup.find('span', class_='subtext').text.strip()[0] == '0':
print(f'No recipes found for {name}')
return
results = self.soup.find('section', id='fixedGridSection')
articles = results.find_all('article')
texts = []
for article in articles:
txt = article.find('h3', class_='fixed-recipe-card__h3')
if txt:
if len(texts) < 5:
texts.append(txt)
else:
break
self.links = [txt.a['href'] for txt in texts]
self.names = [txt.a.span.text for txt in texts]
self.get_data()
def get_data(self):
for i, link in enumerate(self.links):
self.get_url(link)
print('-' * 4 + self.names[i] + '-' * 4)
info_names = [div.text.strip() for div in self.soup.find_all(
'div', class_='recipe-meta-item-header')]
ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
ingredients = [span.text.strip() for span in ingredient_spans]
for i, div in enumerate(self.soup.find_all('div', class_='recipe-meta-item-body')):
print(info_names[i].capitalize(), div.text.strip())
print()
print('Ingredients'.center(len(ingredients[0]), ' '))
print('\n'.join(ingredients))
print()
print('*' * 50, end='\n\n')
chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))
The problem I'm having is the line where it says list_titles.append(title)
at the very bottom, it pulls up the error:
TypeError: append() takes no keyword arguments. Help please and thank you!
Everytime it loops I want to add the item to the list.
import urllib.request
from bs4 import BeautifulSoup
import time
import sys
import pprint
list_titles = []
list_recipes = []
def user_input():
keyword = input('Please type in dinner ideas and we will give you recipes:')
return keyword
def get_html():
keyword = user_input()
search_page_url = 'http://www.foodnetwork.com/search/'
search_page = urllib.request.urlopen(search_page_url+keyword+'-') #downloads
the webpage
search_soup = BeautifulSoup(search_page,'html.parser') #create a tree data
structure out of html
### find all of the 'h3' tags with the class 'm-Med...' ....................................................
### and get the link by finding all the 'a' tags
recipe_blocks = search_soup.find_all('h3',class_='m-MediaBlock__a-
Headline')# finds h3 html tags with that class
for recipe in recipe_blocks:
recipe_url = recipe.find_all('a')[0].get('href') # get link as text
print(recipe_url)
page = urllib.request.urlopen('http:'+recipe_url)
soup = BeautifulSoup(page,'html.parser')
for name in soup.find_all('div',class_='parbase assetTitle')
[0].find_all('span',class_='o-AssetTitle__a-HeadlineText'):
title = (' '+name.get_text()
**list_titles.append(title)**
pprint.pprint(list_titles)
for ingredients in soup.find_all('div',class_='o-Ingredients__m-
Body')[0].find_all('label',class_='o-Ingredients__a-ListItemText'):
recipe = (' '+ingredients.get_text())# gets text from ingredients
list_recipes.append(recipe) # gets text from ingredients
pprint.pprint(list_recipes)
How would i modify the parameters of the findAll method to read both li's and id's? li's are elements and id's are attributes correct?
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites
import csv
import requests
from bs4 import BeautifulSoup
###################### SURFLINE URL STRINGS AND TAG ###########################
slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'
slTagText = 'observed-wave-range'
slTag = 'id'
#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl, slCardiffUrl]
###############################################################################
#################### MAGICSEAWEED URL STRINGS AND TAG #########################
msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'
msTagText = 'rating-text text-dark'
msTag = 'li'
#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]
###############################################################################
'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location
rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each
break
tag: the html tag where the actual report lives on the page
returns: a list of strings of each breaks surf report
'''
def extract_Reports(rootUrl, urlList, tag, tagText):
#empty list to hold reports
reports = []
#loop thru URLs
for url in urlList:
try:
#request page
request = requests.get(rootUrl + url)
#turn into soup
soup = BeautifulSoup(request.content, 'lxml')
#get the tag where report lives
reportTag = soup.findAll(id = tagText)
for report in reportTag:
reports.append(report.string.strip())
#notify if fail
except:
print 'scrape failure'
pass
return reports
#END METHOD
slReports = extract_Reports(slRootUrl, slUrls, slTag, slTagText)
msReports = extract_Reports(msRootUrl, msUrls, msTag, msTagText)
print slReports
print msReports
As of right now, only slReports prints correctly because i have it explicitly set to id = tagText. I am also aware that my tag paramater is not used currently.
So the problem is that you want to search the parse tree for elements that have either a class name of rating-text (it turns out you do not need text-dark to identify the relevant elements in the case of Magicseaweed) or an ID of observed-wave-range, using a single findAll call.
You can use a filter function to achieve this:
def reportTagFilter(tag):
return (tag.has_attr('class') and 'rating-text' in tag['class']) \
or (tag.has_attr('id') and tag['id'] == 'observed-wave-range')
Then change your extract_Reports function to read:
reportTag = soup.findAll(reportTagFilter)[0]
reports.append(reportTag.text.strip())