Python Link Scraper - python

focus_Search = raw_input("Focus Search ")
url = "https://www.google.com/search?q="
res = requests.get(url + focus_Search)
print("You Just Searched")
res_String = res.text
#Now I must get ALL the sections of code that start with "<a href" and end with "/a>"
Im trying to scrape all the links from a google search webpage. I could extract each link one at a time but I'm sure theres a better way to do it.

This creates a list of all links in the search page with some of your code, without getting into BeautifulSoup
import requests
import lxml.html
focus_Search = input("Focus Search ")
url = "https://www.google.com/search?q="
#focus_Search
res = requests.get(url + focus_Search).content
# res
dom = lxml.html.fromstring(res)
links = [x for x in dom.xpath('//a/#href')] # Borrows from cheekybastard in link below
# http://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
links

Related

Stuck Scraping with Beautifulsoup

So i'm trying to scrape a html webpage. It has novel chapters and i'm trying to get the text and store in text files to read offline. I don't have any previous experience with html or other things either. So the webpage I am trying to scrape is this. And the code i've been testing so far looks like this
`
import sys
import requests
import time
import re
from bs4 import BeautifulSoup
def browse_and_scrape(seed_url, page_number=1):
# Fetch the URL - We will be using this to append to images and info routes
url_pat = re.compile(r"(http://.*\.org)")
source_url = url_pat.search(seed_url).group(0)
# Page_number from the argument gets formatted in the URL & Fetched
formatted_url = seed_url.format(str(page_number))
# print(url_pat,source_url,formatted_url)
try:
html_text = requests.get(formatted_url).text
# print(html_text)
# Prepare the soup
soup = BeautifulSoup(html_text, "html.parser")
print(soup.find_all(id="chapterContent")[0]["style"])
print(f"Now Scraping - {formatted_url}")
# help = soup.find_all("div",class_="chapter-content text-normal")[0].text.strip().encode("ascii", "ignore").decode("ascii")
# for node in soup.findAll("div",class_="chapter-content text-normal"):
# print(node)
# print(''.join(node.findAll(text=True)))
# for node in soup.findAll("div"):
# # print(node)
# print(''.join(node.findAll(text=True)))
# help = soup.find_all("div",class_="chapter-content text-normal")[0]
# print(''.join(help.findAll(text=True)))
# print(help)
except Exception as e:
return e
return true
if __name__ == "__main__":
# seed_url = "http://books.toscrape.com/catalogue/page-{}.html"
seed_url = "http://wnmtl.org/chapter/324909-heavenly-wolf-valley.html"
# seed_url = "http://wnmtl.org/chapter/{}.html"
print("Web scraping has begun")
result = browse_and_scrape(seed_url)
if result == True:
print("Web scraping is now complete!")
else:
print(f"Oops, That doesn't seem right!!! - {result}")`
All the commented stuff are things i've been trying to rip the text from the tag. From my inspection of the developer console in the browser, all the text is in the tag with id of chapter content. My plan is to iteratively get the text, stuff it, get the link for the next page and repeat but i've been stuck for a bit now, any suggestions.
Instead of scraping each page, you can directly get the text from this API endpoint using requests.
https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/324909
The last item in the above API is the chapter ID (324909). You can navigate to chapters by giving in the chapter IDs.
The next and prev chapter IDs are present in the current chapter's API endpoint. Have a look at the above URL in browser to understand it better.
Here is the full recursive code that writes the text from 3 pages to a file called novel.txt. You may change the number of pages and other details as per your need.
import requests
def get_data(chapter_id, pages):
if pages == 0:
return
url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/' + str(chapter_id)
r = requests.get(url)
x = r.json()
pre_id = x['data']['preId']
next_id = x['data']['nextId']
title = x['data']['title']
content = x['data']['content']
chapter_title = f'\n***** Chapter: {title} *****\n'
with open('novel.txt', 'a') as f:
f.write(chapter_title)
f.write(content + '\n')
print(f"Chapter: '{title}' written to file.")
get_data(next_id, pages-1)
curr_id = '324909'
get_data(curr_id, 3)
Chapter: 'Heavenly Wolf Valley' written to file.
Chapter: 'Leaving' written to file.
Chapter: 'Pure Fabrication' written to file.

How can I get data from a website using BeautifulSoup and requests?

I am a beginner in web scraping, and I need help with this problem.
The website, allrecipes.com, is a website where you can find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click-> view page source)
I want to create a program that takes a input, searches it up on allrecipes, and returns a list with tuples of the first five recipes with data such as the time that takes to make, serving yield, ingrediants, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup
def searchdata():
inp=input('what recipe would you like to search')
url ='http://www.allrecipes.com/search/results/?wt='+str(inp)+'&sort=re'
r=requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
links=[]
#fill in code for finding top 3 or five links
for i in range(3)
a = requests.get(links[i])
soupa = BeautifulSoup(a.text, 'html.parser')
#fill in code to find name, ingrediants, time, and serving size with data from soupa
names=[]
time=[]
servings=[]
ratings=[]
ingrediants=[]
searchdata()
Yes, i know, my code is very messy but What should I fill in in the two code fill-in areas?
Thanks
After searching for the recipe you have to get the links of each recipe and then request again for each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP so here's the class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup
class Scraper:
links = []
names = []
def get_url(self, url):
url = requests.get(url)
self.soup = BeautifulSoup(url.content, 'html.parser')
def print_info(self, name):
self.get_url(f'https://www.allrecipes.com/search/results/?wt={name}&sort=re')
if self.soup.find('span', class_='subtext').text.strip()[0] == '0':
print(f'No recipes found for {name}')
return
results = self.soup.find('section', id='fixedGridSection')
articles = results.find_all('article')
texts = []
for article in articles:
txt = article.find('h3', class_='fixed-recipe-card__h3')
if txt:
if len(texts) < 5:
texts.append(txt)
else:
break
self.links = [txt.a['href'] for txt in texts]
self.names = [txt.a.span.text for txt in texts]
self.get_data()
def get_data(self):
for i, link in enumerate(self.links):
self.get_url(link)
print('-' * 4 + self.names[i] + '-' * 4)
info_names = [div.text.strip() for div in self.soup.find_all(
'div', class_='recipe-meta-item-header')]
ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
ingredients = [span.text.strip() for span in ingredient_spans]
for i, div in enumerate(self.soup.find_all('div', class_='recipe-meta-item-body')):
print(info_names[i].capitalize(), div.text.strip())
print()
print('Ingredients'.center(len(ingredients[0]), ' '))
print('\n'.join(ingredients))
print()
print('*' * 50, end='\n\n')
chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))

Python index function

I am writing a simple Python program which grabs a webpage and finds all the URL links in it. However I try to index the starting and ending delimiter (") of each href link but the ending one always indexed wrong.
# open a url and find all the links in it
import urllib2
url=urllib2.urlopen('right.html')
urlinfo = url.info()
urlcontent = url.read()
bodystart = urlcontent.index('<body')
print 'body starts at',bodystart
bodycontent = urlcontent[bodystart:].lower()
print bodycontent
linklist = []
n = bodycontent.index('<a href=')
while n:
print n
bodycontent = bodycontent[n:]
a = bodycontent.index('"')
b = bodycontent[(a+1):].index('"')
print a, b
linklist.append(bodycontent[(a+1):b])
n = bodycontent[b:].index('<a href=')
print linklist
I would suggest using a html parsing library instead of manually searching the DOM String.
Beautiful Soup is an excellent library for this purpose. Here is the reference link
With bs your link searching functionality could look like:
from bs4 import BeautifulSoup
soup = BeautifulSoup(bodycontent, 'html.parser')
linklist = [a.get('href') for a in soup.find_all('a')]

Scrape page with generator

I scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
tld = "uz"
has_next = True
page = 0
def create_link(tld, page):
if page == 0:
link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
else:
link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
return link
def check_for_next(soup):
disabled_nav = soup.find(class_="pagingDivDisabled")
if disabled_nav:
if "Next" in disabled_nav:
return False
else:
return True
else:
return True
def make_soup(link):
html = jw.get_page(link)
soup = BeautifulSoup(html, "lxml")
return soup
def all_the_pages(counter):
while True:
link = create_link(tld, counter)
soup = make_soup(link)
if check_for_next(soup) == True:
yield counter
else:
break
counter += 1
def scrape_page(soup):
table = soup.find('table', {'class': 'rankTable'})
th = table.find('tbody')
test = th.find_all("td")
correct_cells = range(1,len(test),3)
for cell in correct_cells:
#print test[cell]
url = repr(test[cell])
content = re.sub("<[^>]*>", "", url)
sites.writerow([tld]+[content])
def main():
for page in all_the_pages(0):
print page
link = create_link(tld, page)
print link
soup = make_soup(link)
scrape_page(soup)
main()
My thinking behind the code:
The scraper should get the page, determine if there is another page that follows, scrape the current page and move to the next one, repreating the process. If there is no next page, it should stop. Does that make sense how I'm going it here?
As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
pages = 1
url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
while True:
html = requests.get(url.format(pages)).content
soup = BeautifulSoup(html)
table = soup.find('table', {'class': 'rankTable'})
if len(table.find_all('tr')) <= 1:
return pages
pages += 1

Adding counter to my Python Web Crawler

I've made a web crawler which gives a link and text from link for all sites in given addres it looks like this:
import urllib
from bs4 import BeautifulSoup
import urlparse
import mechanize
url = ["http://adbnews.com/area51"]
for u in url:
br = mechanize.Browser()
urls = [u]
visited = [u]
i = 0
while i<len(urls):
try:
br.open(urls[0])
urls.pop(0)
for link in br.links():
levelLinks = []
linkText = []
newurl = urlparse.urljoin(link.base_url, link.url)
b1 = urlparse.urlparse(newurl).hostname
b2 = urlparse.urlparse(newurl).path
newurl = "http://"+b1+b2
linkTxt = link.text
linkText.append(linkTxt)
levelLinks.append(newurl)
if newurl not in visited and urlparse.urlparse(u).hostname in newurl:
urls.append(newurl)
visited.append(newurl)
#print newurl
#get Mechanize Links
for l,lt in zip(levelLinks,linkText):
print newurl,"\n",lt,"\n"
except:
urls.pop(0)
it gets results like that:
http://www.adbnews.com/area51/contact.html
CONTACT
http://www.adbnews.com/area51/about.html
ABOUT
http://www.adbnews.com/area51/index.html
INDEX
http://www.adbnews.com/area51/1st/
FIRST LEVEL!
http://www.adbnews.com/area51/1st/bling.html
BLING
http://www.adbnews.com/area51/1st/index.html
INDEX
http://adbnews.com/area51/2nd/
2ND LEVEL
And I wanna add a counter of somekind which could limit how deep crawler goes..
I've tried add for example steps = 3 and change while i<len(urls) in while i<steps:
but that would go only to first level even the number says 3...
Any advice is welcome
If you want to search a certain "depth", consider using a recursive function instead of just appending a list of URL's.
def crawl(url, depth):
if depth <= 3:
#Scan page, grab links, title
for link in links:
print crawl(link, depth + 1)
return url +"\n"+ title
This allows for easier control of your recursive searching, as well as being faster and less resource heavy :)

Categories

Resources