Python Web crawler, crawl through links and find specific words

Python Web crawler, crawl through links and find specific words - python

So I am trying to code a web crawler that goes into a each chapter of a title for a Statue and count occurrence of a set a key words ("shall" "must") in its content.
Below is the code i used to acquire links to each chapters.
The base URL I used is http://law.justia.com/codes/georgia/2015/
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
from collections import Counter
pattern1 = re.compile(r"\bshall\b",re.IGNORECASE)
pattern2 = re.compile(r"\bmust\b",re.IGNORECASE)
########################################Sections##########################
def levelthree(item2_url):
r = requests.get(item2_url)
for sectionlinks in BeautifulSoup((r.content),"html.parser",parse_only=SoupStrainer('a')):
if sectionlinks.has_attr('href'):
if 'section' in sectionlinks['href']:
href = "http://law.justia.com" + sectionlinks.get('href')
href = "\n" + str(href)
print (href)
########################################Chapters##########################
def leveltwo(item_url):
r = requests.get(item_url)
for sublinks in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
if sublinks.has_attr('href'):
if 'chapt' in sublinks['href']:
chapterlinks = "http://law.justia.com" + sublinks.get('href')
# chapterlinks = "\n" + str(chapterlinks)
#print (chapterlinks)
######################################Titles###############################
def levelone(url):
r = requests.get(url)
for links in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
if links.has_attr('href'):
if 'title-43' in links['href']:
titlelinks = "http://law.justia.com" + links.get('href')
# titlelinks = "\n" + str(titlelinks)
leveltwo(titlelinks)
# print (titlelinks)
###########################################################################
base_url = "http://law.justia.com/codes/georgia/2015/"
levelone(base_url)
The problem is that the structure of the page are usually title - chapter - sections - contents ( ex: http://law.justia.com/codes/georgia/2015/title-43/chapter-1/section-43-1-1/)
But there are ones that are title - chapter - articles - sections - contents (ex.http://law.justia.com/codes/georgia/2015/title-43/chapter-4/article-1/section-43-4-1/ )
I am able to get the links for the first scenario. However, I will miss all the title- chapter - article - sections - contents
My questions is, how can I code this so that I will be able get contents for each chapter (from sections links and from article to sections links) then look for the occurrence of words (such as "shall" or "must") for each chapter individually?
I want to find the word frequency by chapters, hopefully the output will be something like this
chapter 1
Word Frequency
shall 35
must 3
chapter 2
Word Frequency
shall 59
must 14

for this problem, count '/' in urls
http://law.justia.com/codes/georgia/2015/title-43/chapter-1/section-43-1-1/)
http://law.justia.com/codes/georgia/2015/title-43/chapter-4/article-1/section-43-4-1/ )
if url.count('/') == 9:
# do somthing
if url.count('/') == 10:
# do somthing
or you can do a simple trick:
part = url.split('/')
title = part[7]
chapter = part[8]
section = part[-1]
Note: -1 means last part
to count shall or must:
use count function for the same
shall_count = response_text.count('shall')
must_count = response_text.count('must')

Related

Stuck Scraping with Beautifulsoup

So i'm trying to scrape a html webpage. It has novel chapters and i'm trying to get the text and store in text files to read offline. I don't have any previous experience with html or other things either. So the webpage I am trying to scrape is this. And the code i've been testing so far looks like this
`
import sys
import requests
import time
import re
from bs4 import BeautifulSoup
def browse_and_scrape(seed_url, page_number=1):
# Fetch the URL - We will be using this to append to images and info routes
url_pat = re.compile(r"(http://.*\.org)")
source_url = url_pat.search(seed_url).group(0)
# Page_number from the argument gets formatted in the URL & Fetched
formatted_url = seed_url.format(str(page_number))
# print(url_pat,source_url,formatted_url)
try:
html_text = requests.get(formatted_url).text
# print(html_text)
# Prepare the soup
soup = BeautifulSoup(html_text, "html.parser")
print(soup.find_all(id="chapterContent")[0]["style"])
print(f"Now Scraping - {formatted_url}")
# help = soup.find_all("div",class_="chapter-content text-normal")[0].text.strip().encode("ascii", "ignore").decode("ascii")
# for node in soup.findAll("div",class_="chapter-content text-normal"):
# print(node)
# print(''.join(node.findAll(text=True)))
# for node in soup.findAll("div"):
# # print(node)
# print(''.join(node.findAll(text=True)))
# help = soup.find_all("div",class_="chapter-content text-normal")[0]
# print(''.join(help.findAll(text=True)))
# print(help)
except Exception as e:
return e
return true
if __name__ == "__main__":
# seed_url = "http://books.toscrape.com/catalogue/page-{}.html"
seed_url = "http://wnmtl.org/chapter/324909-heavenly-wolf-valley.html"
# seed_url = "http://wnmtl.org/chapter/{}.html"
print("Web scraping has begun")
result = browse_and_scrape(seed_url)
if result == True:
print("Web scraping is now complete!")
else:
print(f"Oops, That doesn't seem right!!! - {result}")`
All the commented stuff are things i've been trying to rip the text from the tag. From my inspection of the developer console in the browser, all the text is in the tag with id of chapter content. My plan is to iteratively get the text, stuff it, get the link for the next page and repeat but i've been stuck for a bit now, any suggestions.

Instead of scraping each page, you can directly get the text from this API endpoint using requests.
https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/324909
The last item in the above API is the chapter ID (324909). You can navigate to chapters by giving in the chapter IDs.
The next and prev chapter IDs are present in the current chapter's API endpoint. Have a look at the above URL in browser to understand it better.
Here is the full recursive code that writes the text from 3 pages to a file called novel.txt. You may change the number of pages and other details as per your need.
import requests
def get_data(chapter_id, pages):
if pages == 0:
return
url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/' + str(chapter_id)
r = requests.get(url)
x = r.json()
pre_id = x['data']['preId']
next_id = x['data']['nextId']
title = x['data']['title']
content = x['data']['content']
chapter_title = f'\n***** Chapter: {title} *****\n'
with open('novel.txt', 'a') as f:
f.write(chapter_title)
f.write(content + '\n')
print(f"Chapter: '{title}' written to file.")
get_data(next_id, pages-1)
curr_id = '324909'
get_data(curr_id, 3)
Chapter: 'Heavenly Wolf Valley' written to file.
Chapter: 'Leaving' written to file.
Chapter: 'Pure Fabrication' written to file.

How can I get data from a website using BeautifulSoup and requests?

I am a beginner in web scraping, and I need help with this problem.
The website, allrecipes.com, is a website where you can find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click-> view page source)
I want to create a program that takes a input, searches it up on allrecipes, and returns a list with tuples of the first five recipes with data such as the time that takes to make, serving yield, ingrediants, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup
def searchdata():
inp=input('what recipe would you like to search')
url ='http://www.allrecipes.com/search/results/?wt='+str(inp)+'&sort=re'
r=requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
links=[]
#fill in code for finding top 3 or five links
for i in range(3)
a = requests.get(links[i])
soupa = BeautifulSoup(a.text, 'html.parser')
#fill in code to find name, ingrediants, time, and serving size with data from soupa
names=[]
time=[]
servings=[]
ratings=[]
ingrediants=[]
searchdata()
Yes, i know, my code is very messy but What should I fill in in the two code fill-in areas?
Thanks

After searching for the recipe you have to get the links of each recipe and then request again for each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP so here's the class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup
class Scraper:
links = []
names = []
def get_url(self, url):
url = requests.get(url)
self.soup = BeautifulSoup(url.content, 'html.parser')
def print_info(self, name):
self.get_url(f'https://www.allrecipes.com/search/results/?wt={name}&sort=re')
if self.soup.find('span', class_='subtext').text.strip()[0] == '0':
print(f'No recipes found for {name}')
return
results = self.soup.find('section', id='fixedGridSection')
articles = results.find_all('article')
texts = []
for article in articles:
txt = article.find('h3', class_='fixed-recipe-card__h3')
if txt:
if len(texts) < 5:
texts.append(txt)
else:
break
self.links = [txt.a['href'] for txt in texts]
self.names = [txt.a.span.text for txt in texts]
self.get_data()
def get_data(self):
for i, link in enumerate(self.links):
self.get_url(link)
print('-' * 4 + self.names[i] + '-' * 4)
info_names = [div.text.strip() for div in self.soup.find_all(
'div', class_='recipe-meta-item-header')]
ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
ingredients = [span.text.strip() for span in ingredient_spans]
for i, div in enumerate(self.soup.find_all('div', class_='recipe-meta-item-body')):
print(info_names[i].capitalize(), div.text.strip())
print()
print('Ingredients'.center(len(ingredients[0]), ' '))
print('\n'.join(ingredients))
print()
print('*' * 50, end='\n\n')
chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))

Trying to crawl all newslinks in a site (the parsed link only shows 10 results per page, I would need to find ALL links)

I am trying to crawl all news link that has a certain keyword that is looking for.
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
key_word = urllib.parse.quote("금리")
url = "https://search.naver.com/search.naver?where=news&query=" + key_word +"%EA%B8%88%EB%A6%AC&sm=tab_opt&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so%3Ar%2Cp%3Afrom20200413to20200414%2Ca%3Aall&mynews=0&refresh_start=0&related=0"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
anchor_set = soup.findAll('a')
news_link = []
for a in anchor_set:
if str(a).find('https://news.naver.com/main/read.nhn?') != -1:
a = a.get('href')
news_link.append(a)'
Untill this section (code above), I parse into the url and retrieve all links that has a certian read.nhn(naver news platform) and append it to news_link.
This is working fine, but the proble is the url used above only shows 10 articles in the page.
count_tag = soup.find("div",{"class","title_desc all_my"})
count_text=count_tag.find("span").get_text().split()
total_num=count_text[-1][0:-1].replace(",","")
print(total_num)'
Using the code above I've found out there are a total of 1297 articles that I need to collect. but since the original link above only has 10 articles in the page.
for val in range(int(total_num)//10+1):
start_val=str(val*10+1)
I was told i needed to insert this into the url to retrieve ALL newslinks.
Thus, I've used the while method
while start_val <= total_num:
url = "https://search.naver.com/search.naver?where=news&query=" + key_word +"%EA%B8%88%EB%A6%AC&sm=tab_opt&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so%3Ar%2Cp%3Afrom20200413to20200414%2Ca%3Aall&mynews=0&refresh_start=" + start_val + "&related=0"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
news_link = []
anchor_set = soup.findAll('a')
for a in anchor_set:
if str(a).find('https://news.naver.com/main/read.nhn?') != -1:
a = a.get('href')
news_link.append(a)
However, when I run the program, it seems the loop does not stop. obviously there is no else or break.. How can i break this loop and successfully collect all the links?

Your current while loop doesn't stop because you haven't incremented the value of start_val. Also, later you have range(int(total_num)//10+1) so if you converted total_num to a string, then the string comparison in while start_val <= total_num is wrong - for strings "21" > "1297", because "2" > "1". Compare them as int's.
And since you're creating the sequence of vals to use, you don't need a separate upper bound check.
So far, this would give you the correct finite loop:
for val in range(int(total_num)//10+1): # no upper bound check needed
start_val=str(val*10+1)
url = "https://search.naver.com/search.naver?where=news&query=" ...
html = urllib.request.urlopen(url).read()
...
For the values needed for the pages/next starting item, instead of doing:
for val in range(int(total_num)//10+1):
start_val = str(val*10+1)
You can get the actual val's from range(). To starting at 1 and going in steps of 10 to get: 1, 11, 21, ... , upto and including the total:
for val in range(1, total_num + 1, 10):
start_val = str(val) # don't need this assignment actually
Next thing: the URL for page 2 onwards is wrong. Currently, your while loop will generate the following URL for page 2:
https://search.naver.com/search.naver?where=news&query=%EA%B8%88%EB%A6%AC%EA%B8%88%EB%A6%AC&sm=tab_opt&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so%3Ar%2Cp%3Afrom20200413to20200414%2Ca%3Aall&mynews=0&refresh_start=11&related=0
But if you click on page "2" of the results, you get the URL:
https://search.naver.com/search.naver?&where=news&query=%EA%B8%88%EB%A6%AC%EA%B8%88%EB%A6%AC&sm=tab_pge&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so:r,p:from20200413to20200414,a:all&mynews=0&cluster_rank=35&start=11&refresh_start=0
The main difference is at the end: &refresh_start=11 in yours vs &start=11&refresh_start=0 actual. Since that format also works for page 1 (just checked), use that instead.
You have some extra characters in the section after the keyword: ...&query=" + key_word +"%EA%B8%88%EB%A6%AC&sm=tab_opt. That %EA%B8%88%EB%A6%AC is from your previous search keyword.
You can also skip several unneeded URL parameters, by testing which are actually not needed.
Putting all that together:
for val in range(1, total_num + 1, 10):
start_val = str(val)
url = ("https://search.naver.com/search.naver?&where=news&query=" +
key_word +
"&sm=tab_pge&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14" +
"&docid=&nso=so:r,p:from20200413to20200414,a:all&mynews=0&cluster_rank=51" +
"&refresh_start=0&start=" +
start_val)
html = urllib.request.urlopen(url).read()
... # etc.

Unable to scrape the conversation among debaters in order to put them in a dictionary

I've created a script to fetch all the conversation between different debaters excluding moderators. What I've written so far can fetch the total conversation. However, I would like to grab them like {speaker_name: (first speech, second speech) etc }.
Webpage link
another one similar to the above link
webpage link
I've tried so far:
import requests
from bs4 import BeautifulSoup
url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'
def get_links(link):
r = requests.get(link)
soup = BeautifulSoup(r.text,"lxml")
for item in soup.select(".field-docs-content p:has( > strong:contains('MODERATOR:')) ~ p"):
print(item.text)
if __name__ == '__main__':
get_links(url)
How can I scrape the conversation among debaters and put them in a dictionary?

I don't hold much hope for this lasting across lots of pages given the variability amongst the two pages I saw and the number of assumptions I have had to make. Essentially, I use regex on participant and moderators nodes text to isolate the lists of moderators and participants. I then loop all speech paragraphs and each time I encounter a moderator at the start of a paragraph I set a boolean variable store_paragraph = False and ignore subsequent paragraphs; likewise, each time I encounter a participant, I set store_paragraph = True and store that paragraph and subsequent ones under the appropriate participant key in my speaker_dict. I store each speaker_dict in a final results dictionary.
import requests, re
from bs4 import BeautifulSoup as bs
import pprint
links = ['https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas','https://www.presidency.ucsb.edu/documents/republican-presidential-candidates-debate-manchester-new-hampshire-0']
results = {}
p = re.compile(r'\b(\w+)\b\s+\(|\b(\w+)\b,')
with requests.Session() as s:
for number, link in enumerate(links):
r = s.get(link)
soup = bs(r.content,'lxml')
participants_tag = soup.select_one('p:has(strong:contains("PARTICIPANTS:"))')
if participants_tag.select_one('strong'):
participants_tag.strong.decompose()
speaker_dict = {i[0].upper() + ':' if i[0] else i[1].upper() + ':': [] for string in participants_tag.stripped_strings for i in p.findall(string)}
# print(speaker_dict)
moderator_data = [string for string in soup.select_one('p:has(strong:contains("MODERATOR:","MODERATORS:"))').stripped_strings][1:]
#print(moderator_data)
moderators = [i[0].upper() + ':' if i[0] else i[1].upper() + ':' for string in moderator_data for i in p.findall(string)]
store_paragraph = False
for paragraph in soup.select('.field-docs-content p:not(p:contains("PARTICIPANTS:","MODERATOR:"))')[1:]:
string_to_compare = paragraph.text.split(':')[0] + ':'
string_to_compare = string_to_compare.upper()
if string_to_compare in moderators:
store_paragraph = False
elif string_to_compare in speaker_dict:
speaker = string_to_compare
store_paragraph = True
if store_paragraph:
speaker_dict[speaker].append(paragraph.text)
results[number] = speaker_dict
pprint.pprint(results[1])

Following links in Python

I have to write a program that will read the HTML from this link(http://python-data.dr-chuck.net/known_by_Maira.html), extract the href= values from the anchor tags, scan for a tag that is in a particular position relative to the first name in the list, follow that link and repeat the process a number of times and report the last name you find.
I am supposed to find the link at position 18 (the first name is 1), follow that link and repeat this process 7 times. The answer is the last name that I retrieve.
Here is the code I found and it works just fine.
import urllib
from BeautifulSoup import *
url = raw_input("Enter URL: ")
count = int(raw_input("Enter count: "))
position = int(raw_input("Enter position: "))
names = []
while count > 0:
print "retrieving: {0}".format(url)
page = urllib.urlopen(url)
soup = BeautifulSoup(page)
tag = soup('a')
name = tag[position-1].string
names.append(name)
url = tag[position-1]['href']
count -= 1
print names[-1]
I would really appreciate if someone could explain to me like you would to a 10 year old, what's going on inside the while loop. I am new to Python and would really appreciate the guidance.

while count > 0: # because of `count -= 1` below,
# will run loop count times
print "retrieving: {0}".format(url) # just prints out the next web page
# you are going to get
page = urllib.urlopen(url) # urls reference web pages (well,
# many types of web content but
# we'll stick with web pages)
soup = BeautifulSoup(page) # web pages are frequently written
# in html which can be messy. this
# package "unmessifies" it
tag = soup('a') # in html you can highlight text and
# reference other web pages with <a>
# tags. this get all of the <a> tags
# in a list
name = tag[position-1].string # This gets the <a> tag at position-1
# and then gets its text value
names.append(name) # this puts that value in your own
# list.
url = tag[position-1]['href'] # html tags can have attributes. On
# and <a> tag, the href="something"
# attribute references another web
# page. You store it in `url` so that
# its the page you grab on the next
# iteration of the loop.
count -= 1

You enter the number of urls you want to retrieve from a page
0) prints url
1) opens url
2) reads source
BeautifulSoup docs
3) gets every a tag
4) gets the whole <a ...></a> I think
5) adds it to a list names
6) gets url from the last item of names, ie pulls href from <a ...></a>
7) prints the last of the list names

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
total=0
url = input('Enter - ')
c=input('enter count-')
count=int(c)
p=input('enter position-')
pos=int(p)
while total<=count:
html = urllib.request.urlopen(url, context=ctx).read()
print("Retrieving",url)
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
counter=0
for tag in tags:
counter=counter+1
if(counter<=pos):
x=tag.get('href',None)
url=x
else:
break
total=total+1

Solution with explanations.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
url = input('Enter - ')
count = int(input('Enter count: '))
position = int(input ('Enter position: '))
names = []
while count > 0:
print('Retrieving: {}'.format(url))
html = urllib.request.urlopen(url) # open the url using urllib
soup = BeautifulSoup(html, 'html.parser')# parse html data in a clean format
# Retrieve all of the anchor tags
tags = soup('a')
# This gets the <a> tag at position-1 and then gets its text value
name = tags[position-1].string
names.append(name) #add the name to our list
url = tags[position-1]['href']#retrieve the url for next iteratopn
count -= 1
print(names)
print('Answer: ',names[count-1])
Hope it helps.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Web crawler, crawl through links and find specific words - python

Related

Stuck Scraping with Beautifulsoup

How can I get data from a website using BeautifulSoup and requests?

Trying to crawl all newslinks in a site (the parsed link only shows 10 results per page, I would need to find ALL links)

Unable to scrape the conversation among debaters in order to put them in a dictionary

Following links in Python

Categories

Resources