I am a beginner in web scraping, and I need help with this problem.
The website, allrecipes.com, is a website where you can find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click-> view page source)
I want to create a program that takes an input, searches for it on allrecipes, and returns a list of tuples for the first five recipes, with data such as the time it takes to make, serving yield, ingredients, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup
def searchdata():
    """Ask for a search term, query allrecipes.com and visit the top results.

    This is still a skeleton: the two ``fill in`` comments mark the parsing
    that has not been written yet, so ``links`` stays empty and no recipe
    page is fetched until the first one is implemented.

    Returns:
        A list of ``(name, time, servings, rating, ingredients)`` tuples, one
        per recipe; empty until the fill-in sections populate the lists.
    """
    inp = input('what recipe would you like to search')
    url = 'http://www.allrecipes.com/search/results/?wt=' + str(inp) + '&sort=re'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    #fill in code for finding top 3 or five links

    # The result lists are created before the loop so the per-recipe code
    # below can append to them.
    names = []
    time = []
    servings = []
    ratings = []
    ingrediants = []
    # Slicing (instead of indexing range(3)) copes with fewer than 3 links.
    for link in links[:3]:
        a = requests.get(link)
        soupa = BeautifulSoup(a.text, 'html.parser')
        #fill in code to find name, ingrediants, time, and serving size with data from soupa
    return list(zip(names, time, servings, ratings, ingrediants))
searchdata()
Yes, I know my code is very messy, but what should I fill in at the two fill-in areas?
Thanks
After searching for the recipe you have to get the links of each recipe and then request again for each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP so here's the class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup
class Scraper:
    """Search allrecipes.com for a dish and print details of the first hits.

    ``print_info`` runs the search and stores the recipe links and names;
    ``get_data`` then downloads every stored link and prints its metadata
    and ingredient list.
    """

    # Maximum number of search results that are followed.
    MAX_RESULTS = 5

    def __init__(self):
        # Per-instance state (previously shared class-level lists).
        self.links = []
        self.names = []
        self.soup = None

    def get_url(self, url):
        """Download *url* and keep the parsed document in ``self.soup``."""
        response = requests.get(url)
        self.soup = BeautifulSoup(response.content, 'html.parser')

    def print_info(self, name):
        """Search for *name* and print the data of the first matching recipes.

        Prints a "No recipes found" message and returns when the result
        counter starts with ``0`` or the result grid is missing.
        """
        from urllib.parse import quote_plus

        # Encode the term so spaces, '&' or '#' cannot break the query string.
        self.get_url(
            f'https://www.allrecipes.com/search/results/?wt={quote_plus(name)}&sort=re')
        counter = self.soup.find('span', class_='subtext')
        results = self.soup.find('section', id='fixedGridSection')
        # startswith() is safe on empty text, unlike indexing with [0].
        no_hits = counter is not None and counter.text.strip().startswith('0')
        if results is None or no_hits:
            print(f'No recipes found for {name}')
            return
        texts = []
        for article in results.find_all('article'):
            txt = article.find('h3', class_='fixed-recipe-card__h3')
            if not txt:
                continue  # not every <article> carries a recipe heading
            texts.append(txt)
            if len(texts) == self.MAX_RESULTS:
                break
        self.links = [txt.a['href'] for txt in texts]
        self.names = [txt.a.span.text for txt in texts]
        self.get_data()

    def get_data(self):
        """Fetch every stored recipe link and print its details."""
        for name, link in zip(self.names, self.links):
            self.get_url(link)
            headers = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-header')]
            bodies = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-body')]
            ingredients = [span.text.strip() for span in self.soup.find_all(
                'span', class_='ingredients-item-name')]
            # zip() pairs headers with bodies and stops at the shorter list
            # instead of raising IndexError on a mismatch.
            print(self._render_recipe(name, zip(headers, bodies), ingredients))

    @staticmethod
    def _render_recipe(name, meta, ingredients):
        """Return the printable report for one recipe.

        Args:
            name: Recipe title used in the ``----title----`` header line.
            meta: Iterable of ``(label, value)`` string pairs.
            ingredients: List of ingredient strings; may be empty.

        Returns:
            The report text, ending with a single newline after the row of
            asterisks (``print`` supplies the second one).
        """
        # The heading is centred over the first ingredient; with no
        # ingredients it is simply left as is.
        width = len(ingredients[0]) if ingredients else 0
        lines = ['-' * 4 + name + '-' * 4]
        lines.extend(f'{label.capitalize()} {value}' for label, value in meta)
        lines.append('')
        lines.append('Ingredients'.center(width, ' '))
        lines.append('\n'.join(ingredients))
        lines.append('')
        lines.append('*' * 50)
        return '\n'.join(lines) + '\n'
chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))
Related
So I'm trying to scrape an HTML webpage. It has novel chapters, and I'm trying to get the text and store it in text files to read offline. I don't have any previous experience with HTML or related things. The webpage I am trying to scrape is this. The code I've been testing so far looks like this:
`
import sys
import requests
import time
import re
from bs4 import BeautifulSoup
def browse_and_scrape(seed_url, page_number=1):
    """Fetch one chapter page and print the style of its content element.

    Args:
        seed_url: Page URL; may contain a ``{}`` placeholder that is filled
            with *page_number*.
        page_number: Value substituted into *seed_url*.

    Returns:
        ``True`` on success, otherwise the exception that was caught (the
        caller prints it instead of handling a raise).
    """
    # Page_number from the argument gets formatted in the URL & Fetched
    formatted_url = seed_url.format(str(page_number))
    # print(formatted_url)
    try:
        html_text = requests.get(formatted_url).text
        # print(html_text)
        # Prepare the soup
        soup = BeautifulSoup(html_text, "html.parser")
        print(soup.find_all(id="chapterContent")[0]["style"])
        print(f"Now Scraping - {formatted_url}")
        # help = soup.find_all("div",class_="chapter-content text-normal")[0].text.strip().encode("ascii", "ignore").decode("ascii")
        # for node in soup.findAll("div",class_="chapter-content text-normal"):
        #     print(node)
        #     print(''.join(node.findAll(text=True)))
        # for node in soup.findAll("div"):
        #     # print(node)
        #     print(''.join(node.findAll(text=True)))
        # help = soup.find_all("div",class_="chapter-content text-normal")[0]
        # print(''.join(help.findAll(text=True)))
        # print(help)
    except Exception as e:
        return e
    # Was `return true`, which raised NameError on every successful run.
    return True
if __name__ == "__main__":
    # seed_url = "http://books.toscrape.com/catalogue/page-{}.html"
    seed_url = "http://wnmtl.org/chapter/324909-heavenly-wolf-valley.html"
    # seed_url = "http://wnmtl.org/chapter/{}.html"
    print("Web scraping has begun")
    result = browse_and_scrape(seed_url)
    # browse_and_scrape returns True or the caught exception, so test identity.
    if result is True:
        print("Web scraping is now complete!")
    else:
        print(f"Oops, That doesn't seem right!!! - {result}")
All the commented-out lines are things I've tried in order to extract the text from the tag. From my inspection of the developer console in the browser, all the text is in the tag with the id chapterContent. My plan is to iteratively get the text, store it, get the link for the next page and repeat, but I've been stuck for a while now. Any suggestions?
Instead of scraping each page, you can directly get the text from this API endpoint using requests.
https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/324909
The last item in the above API is the chapter ID (324909). You can navigate to chapters by giving in the chapter IDs.
The next and prev chapter IDs are present in the current chapter's API endpoint. Have a look at the above URL in browser to understand it better.
Here is the full recursive code that writes the text from 3 pages to a file called novel.txt. You may change the number of pages and other details as per your need.
import requests
def get_data(chapter_id, pages):
    """Append consecutive chapters to ``novel.txt``, starting at *chapter_id*.

    Args:
        chapter_id: ID of the first chapter to download.
        pages: Number of chapters to write. ``0`` writes nothing; a negative
            value keeps going until the API reports no next chapter.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    base_url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/'
    # A loop replaces the original recursion so long runs cannot hit the
    # interpreter's recursion limit.
    while pages != 0 and chapter_id:
        r = requests.get(base_url + str(chapter_id))
        r.raise_for_status()
        data = r.json()['data']
        title = data['title']
        # Explicit UTF-8 keeps the output independent of the platform default.
        with open('novel.txt', 'a', encoding='utf-8') as f:
            f.write(f'\n***** Chapter: {title} *****\n')
            f.write(data['content'] + '\n')
        print(f"Chapter: '{title}' written to file.")
        # presumably nextId is empty/None on the last chapter - TODO confirm
        chapter_id = data.get('nextId')
        pages -= 1
curr_id = '324909'
get_data(curr_id, 3)
Chapter: 'Heavenly Wolf Valley' written to file.
Chapter: 'Leaving' written to file.
Chapter: 'Pure Fabrication' written to file.
import requests
from bs4 import BeautifulSoup
import re
links = ["https://bitcointalk.org/index.php?board=159.0",
"https://bitcointalk.org/index.php?board=159.40",
"https://bitcointalk.org/index.php?board=159.80"]
def get_span():
    """Print, board page by board page, the topic links found in ``links``.

    For every URL the markup of all ``span`` elements whose id starts with
    ``msg_`` is printed, then every ``digits.digits`` number of at least nine
    characters found in it is printed as a topic URL, highest number first.
    Each page is sorted on its own.
    """
    topic_prefix = "https://bitcointalk.org/index.php?topic"
    msg_id = re.compile('^msg_')
    for board_url in links:
        response = requests.get(board_url)
        soup = BeautifulSoup(response.text, "html.parser")
        spans_markup = str(soup.findAll("span", id=msg_id))
        print(spans_markup)
        numbers = sorted(re.findall(r'\d+\.\d+', spans_markup),
                         key=float, reverse=True)
        for number in numbers:
            if len(number) < 9:
                continue  # too short to be a topic id
            print('{}={}'.format(topic_prefix, number))
get_span()
Hello!
My code iterates over the items in links, finds the spans whose id starts
with msg_, extracts all the numbers from them and sorts them in descending
order. The problem is that it processes the first item and prints its output,
then the second item, and so on, so the output contains 3 separate lists,
each sorted on its own. I want a single output in which the results from all
3 links are sorted together in one list.
You can use list.extend to add items to list and then sort the final list before returning it.
For example:
import re
import requests
from bs4 import BeautifulSoup
links = ["https://bitcointalk.org/index.php?board=159.0",
"https://bitcointalk.org/index.php?board=159.40",
"https://bitcointalk.org/index.php?board=159.80"]
def get_span(links):
    """Collect topic links from all board pages and sort them together.

    Args:
        links: Iterable of board page URLs.

    Returns:
        The hrefs of all anchors directly inside ``span`` elements whose id
        starts with ``msg_`` and whose href contains a topic number (seven or
        more digits, a dot, more digits), sorted by that number, highest
        first.
    """
    topic_number = re.compile(r'\d{7,}\.\d+')
    hrefs = []
    for board_url in links:
        response = requests.get(board_url)
        soup = BeautifulSoup(response.content, "html.parser")
        for anchor in soup.select('span[id^="msg_"] > a'):
            href = anchor['href']
            if topic_number.search(href):
                hrefs.append(href)

    def sort_key(href):
        return float(topic_number.search(href).group(0))

    hrefs.sort(key=sort_key, reverse=True)
    return hrefs
all_links = get_span(links)
# print links on screen:
for link in all_links:
print(link)
Prints:
https://bitcointalk.org/index.php?topic=5255494.0
https://bitcointalk.org/index.php?topic=5255416.0
https://bitcointalk.org/index.php?topic=5255389.0
https://bitcointalk.org/index.php?topic=5255376.0
https://bitcointalk.org/index.php?topic=5255316.0
https://bitcointalk.org/index.php?topic=5254720.0
https://bitcointalk.org/index.php?topic=5254480.0
https://bitcointalk.org/index.php?topic=5254448.0
https://bitcointalk.org/index.php?topic=5254287.0
https://bitcointalk.org/index.php?topic=5252504.0
https://bitcointalk.org/index.php?topic=5251621.0
https://bitcointalk.org/index.php?topic=5250998.0
https://bitcointalk.org/index.php?topic=5250388.0
https://bitcointalk.org/index.php?topic=5250185.0
https://bitcointalk.org/index.php?topic=5248406.0
https://bitcointalk.org/index.php?topic=5247112.0
... and so on.
EDIT: If you want to show the link text next to the URL, you can use this example:
import re
import requests
from bs4 import BeautifulSoup
links = ["https://bitcointalk.org/index.php?board=159.0",
"https://bitcointalk.org/index.php?board=159.40",
"https://bitcointalk.org/index.php?board=159.80"]
def get_span(links):
    """Collect ``(href, link text)`` pairs from all board pages, sorted.

    Args:
        links: Iterable of board page URLs.

    Returns:
        A list of ``(href, text)`` tuples for every anchor directly inside a
        ``span`` whose id starts with ``msg_`` and whose href contains a topic
        number, sorted by that number, highest first.
    """
    topic_number = re.compile(r'\d{7,}\.\d+')
    pairs = []
    for board_url in links:
        response = requests.get(board_url)
        soup = BeautifulSoup(response.content, "html.parser")
        for anchor in soup.select('span[id^="msg_"] > a'):
            href = anchor['href']
            if topic_number.search(href):
                pairs.append((href, anchor.text))

    def sort_key(pair):
        return float(topic_number.search(pair[0]).group(0))

    pairs.sort(key=sort_key, reverse=True)
    return pairs
all_links = get_span(links)
# print links on screen:
for link, text in all_links:
print('{} {}'.format(link, text))
Prints:
https://bitcointalk.org/index.php?topic=5255494.0 NUL Token - A new hyper-deflationary experiment! Airdrop!
https://bitcointalk.org/index.php?topic=5255416.0 KEEP NETWORK - A privacy layer for Ethereum
https://bitcointalk.org/index.php?topic=5255389.0 [ANN] ICO - OBLICHAIN | Blockchain technology at the service of creative genius
https://bitcointalk.org/index.php?topic=5255376.0 UniChain - The 4th Generation Blockchain Made For The Smart Society 5.0
https://bitcointalk.org/index.php?topic=5255316.0 INFINITE RICKS ! First Multiverse Cryptocurrency ! PoS 307%
https://bitcointalk.org/index.php?topic=5254720.0 [GMC] GameCredits - Unofficial & Unmoderated for Censored Posts.
https://bitcointalk.org/index.php?topic=5254480.0 [ANN] [BTCV] Bitcoin VaultA higher standard in security
https://bitcointalk.org/index.php?topic=5254448.0 [ANN] Silvering (SLVG) token - New Silver Asset Backed Cryptocurrency
... and so on.
I have written some code that help me scrape websites. It has worked well on some sites but I am currently running into an issue.
The collectData() function collects data from a site and appends it to 'dataList'. From this dataList I can create a csv file to export the data.
The issue I am having right now is that the function appends multiple whitespace and \n characters to my list. The output looks like this (the excessive whitespace is not shown here):
dataList = ['\n 2.500.000 ']
Does anyone know what could cause this? As I mentioned, there are some websites where the function works fine.
Thank you!
def scrape():
    """Collect the first ``<h2>`` text of every listing linked from the index.

    For each index page the hrefs of all ``a.ellipsis`` anchors are gathered,
    turned into absolute URLs and fetched; the stripped text of the first
    ``<h2>`` on each listing page (or ``None`` when there is none) is stored.

    Returns:
        The list of collected values, one entry per listing link.
    """
    dataList = []
    pageNr = range(0, 1)

    def getUrl(pageUrl):
        """Return the href of every ``a.ellipsis`` link on *pageUrl*."""
        openUrl = urlopen(pageUrl)
        soup = BeautifulSoup(openUrl, 'lxml')
        linkList = []  # was never initialised in the original
        for link in soup.find_all('a', class_="ellipsis"):
            linkNew = link.get('href')
            if linkNew:  # anchors without an href cannot be followed
                linkList.append(linkNew)
        return linkList

    def collectData(anzLinks):
        """Append the stripped ``<h2>`` text of each listing to dataList."""
        for link in anzLinks:
            openAnz = urlopen(link)
            soup = BeautifulSoup(openAnz, 'lxml')
            kaufpreisSuche = soup.find('h2')
            if kaufpreisSuche is None:
                # Explicit check replaces the bare except that hid every error.
                dataList.append(None)
                continue
            # strip() drops the surrounding newlines and padding spaces that
            # the page markup puts around the value.
            kaufpreis = kaufpreisSuche.text.strip()
            dataList.append(kaufpreis)
            print(kaufpreis)

    for page in pageNr:
        pageUrl = 'https://www.example.com/site:{}'.format(page)
        print(pageUrl)
        anzList = getUrl(pageUrl)
        print(len(anzList))
        # The concatenation used to sit inside the string literal, so every
        # entry was the same literal text instead of a real URL.
        anzLinks = ['https://www.example.com/' + href for href in anzList]
        print(anzLinks)
        collectData(anzLinks)
    return dataList
# NOTE: raw_input is the Python 2 builtin; on Python 3 use input() instead.
focus_Search = raw_input("Focus Search ")
url = "https://www.google.com/search"
# Passing the term through `params` lets requests URL-encode it, so spaces,
# '&' or '#' in the search text cannot corrupt the query string.
res = requests.get(url, params={"q": focus_Search})
print("You Just Searched")
res_String = res.text
#Now I must get ALL the sections of code that start with "<a href" and end with "/a>"
I'm trying to scrape all the links from a Google search results page. I could extract each link one at a time, but I'm sure there's a better way to do it.
This creates a list of all links in the search page with some of your code, without getting into BeautifulSoup
import requests
import lxml.html
focus_Search = input("Focus Search ")
url = "https://www.google.com/search"
#focus_Search
# `params` makes requests URL-encode the search term.
res = requests.get(url, params={"q": focus_Search}).content
# res
dom = lxml.html.fromstring(res)
# '@href' selects the href attribute of every <a>; the previous '#href' is
# not valid XPath and made the query fail.
links = dom.xpath('//a/@href')  # Borrows from cheekybastard in link below
# http://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
links
I am scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one, and I'm having a hard time wrapping my head around it and figuring out whether what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
tld = "uz"
has_next = True
page = 0
def create_link(tld, page):
    """Build the domaintyper.com ranking URL for *tld*.

    Page ``0`` maps to the bare listing URL; any other page gets a
    ``/page/<repr(page)>`` suffix.
    """
    base = ("https://domaintyper.com/top-websites/most-popular-websites-with-"
            + tld + "-domain")
    if page == 0:
        return base
    return base + "/page/" + repr(page)
def check_for_next(soup):
    """Return whether the page has a usable "Next" navigation link.

    Args:
        soup: Parsed page offering ``find_all(class_=...)``.

    Returns:
        ``False`` when any element with class ``pagingDivDisabled`` contains
        the text "Next", ``True`` otherwise.
    """
    # Every disabled nav element is inspected: looking only at the first one
    # misses a disabled "Next" that follows a disabled "Previous".
    for nav in soup.find_all(class_="pagingDivDisabled"):
        # Compare against the element's text; `"Next" in tag` only tests
        # whether one of the tag's direct children equals "Next".
        if "Next" in nav.get_text():
            return False
    return True
def make_soup(link):
    """Fetch *link* through ``jw.get_page`` and parse the result with lxml."""
    return BeautifulSoup(jw.get_page(link), "lxml")
def all_the_pages(counter):
    """Yield consecutive page numbers, starting at *counter*.

    Every page that is fetched is yielded; iteration stops after the first
    page for which ``check_for_next`` reports no following page.
    """
    while True:
        soup = make_soup(create_link(tld, counter))
        # Yield before checking for a successor: the original broke out of
        # the loop first, so the last page was never handed to the caller.
        yield counter
        if not check_for_next(soup):
            break
        counter += 1
def scrape_page(soup):
    """Write one ``[tld, cell text]`` row per ranked site to ``sites``.

    Takes every third ``td`` of the ``rankTable`` body, starting with the
    second cell, and strips the markup tags from its repr before writing.
    """
    rank_table = soup.find('table', {'class': 'rankTable'})
    cells = rank_table.find('tbody').find_all("td")
    # Cells 1, 4, 7, ... are the ones that get written out.
    for cell in cells[1::3]:
        markup = repr(cell)
        sites.writerow([tld, re.sub("<[^>]*>", "", markup)])
def main():
    """Scrape every page reported by ``all_the_pages`` into the CSV writer."""
    for page in all_the_pages(0):
        # Parenthesised single-argument print works on Python 2 and 3 alike;
        # the bare print statement is a syntax error on Python 3.
        print(page)
        link = create_link(tld, page)
        print(link)
        soup = make_soup(link)
        scrape_page(soup)
main()
My thinking behind the code:
The scraper should get the page, determine whether another page follows, scrape the current page and move to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm going about it here make sense?
As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
    """Probe successive listing pages until one has no data rows.

    Returns:
        The number of the first page whose ``rankTable`` is missing or has at
        most one ``tr`` (counting starts at 1).
    """
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        # Naming the parser avoids bs4's "no parser specified" warning and
        # keeps the result the same on every machine.
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', {'class': 'rankTable'})
        # A missing table is treated like an empty one instead of raising
        # AttributeError on None.
        if table is None or len(table.find_all('tr')) <= 1:
            return pages
        pages += 1