How would I modify the parameters of the findAll method to read both li's and id's? li's are elements and id's are attributes, correct?
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites
import csv
import requests
from bs4 import BeautifulSoup
###################### SURFLINE URL STRINGS AND TAG ###########################
#root url; extract_Reports prepends it to each url ending below
slRootUrl = 'http://www.surfline.com/surf-report/'
#url endings, one per surf break
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'
#passed as tagText: the id value extract_Reports searches for
slTagText = 'observed-wave-range'
#passed as tag: extract_Reports accepts it but does not read it
slTag = 'id'
#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl, slCardiffUrl]
###############################################################################
#################### MAGICSEAWEED URL STRINGS AND TAG #########################
#root url; extract_Reports prepends it to each url ending below
msRootUrl = 'http://magicseaweed.com/'
#url endings, one per surf break
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'
#passed as tagText; presumably the class string of the report element, but
#extract_Reports compares it against id, so nothing matches - TODO confirm
msTagText = 'rating-text text-dark'
#passed as tag: extract_Reports accepts it but does not read it
msTag = 'li'
#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]
###############################################################################
'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location
rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each
break
tag: the html tag where the actual report lives on the page
returns: a list of strings of each breaks surf report
'''
def extract_Reports(rootUrl, urlList, tag, tagText):
#empty list to hold reports
reports = []
#loop thru URLs
for url in urlList:
try:
#request page
request = requests.get(rootUrl + url)
#turn into soup
soup = BeautifulSoup(request.content, 'lxml')
#get the tag where report lives
reportTag = soup.findAll(id = tagText)
for report in reportTag:
reports.append(report.string.strip())
#notify if fail
except:
print 'scrape failure'
pass
return reports
#END METHOD
slReports = extract_Reports(slRootUrl, slUrls, slTag, slTagText)
msReports = extract_Reports(msRootUrl, msUrls, msTag, msTagText)
print slReports
print msReports
As of right now, only slReports prints correctly because I have it explicitly set to id = tagText. I am also aware that my tag parameter is not currently used.
So the problem is that you want to search the parse tree for elements that have either a class name of rating-text (it turns out you do not need text-dark to identify the relevant elements in the case of Magicseaweed) or an ID of observed-wave-range, using a single findAll call.
You can use a filter function to achieve this:
def reportTagFilter(tag):
    """Return True for elements that hold a surf report on either site.

    An element qualifies when 'rating-text' is among its classes or when its
    id is exactly 'observed-wave-range'. Meant to be passed to findAll as a
    filter function.
    """
    # Class-based match first; fall through to the id-based one otherwise.
    if tag.has_attr('class') and 'rating-text' in tag['class']:
        return True
    return tag.has_attr('id') and tag['id'] == 'observed-wave-range'
Then change your extract_Reports function to read:
reportTag = soup.findAll(reportTagFilter)[0]
reports.append(reportTag.text.strip())
Related
I'm trying to scrape an HTML webpage. It contains novel chapters, and I want to extract the text and store it in text files to read offline. I don't have any previous experience with HTML or related technologies. The webpage I am trying to scrape is this one, and the code I've been testing so far looks like this:
`
import sys
import requests
import time
import re
from bs4 import BeautifulSoup
def browse_and_scrape(seed_url, page_number=1):
    """Fetch one page and print the ``style`` attribute of its chapter element.

    seed_url: page URL; a ``{}`` placeholder in it is replaced by
        ``page_number``. It must contain ``http://....org``, otherwise the
        pattern search below fails with AttributeError before the ``try``.
    page_number: value formatted into ``seed_url`` (default 1).

    Returns True on success. An exception raised while fetching or parsing is
    returned (not raised) so the caller can print it.
    """
    # Fetch the URL - We will be using this to append to images and info routes
    url_pat = re.compile(r"(http://.*\.org)")
    source_url = url_pat.search(seed_url).group(0)
    # Page_number from the argument gets formatted in the URL & Fetched
    formatted_url = seed_url.format(str(page_number))
    # print(url_pat,source_url,formatted_url)
    try:
        html_text = requests.get(formatted_url).text
        # print(html_text)
        # Prepare the soup
        soup = BeautifulSoup(html_text, "html.parser")
        print(soup.find_all(id="chapterContent")[0]["style"])
        print(f"Now Scraping - {formatted_url}")
        # help = soup.find_all("div",class_="chapter-content text-normal")[0].text.strip().encode("ascii", "ignore").decode("ascii")
        # for node in soup.findAll("div",class_="chapter-content text-normal"):
        # print(node)
        # print(''.join(node.findAll(text=True)))
        # for node in soup.findAll("div"):
        # # print(node)
        # print(''.join(node.findAll(text=True)))
        # help = soup.find_all("div",class_="chapter-content text-normal")[0]
        # print(''.join(help.findAll(text=True)))
        # print(help)
    except Exception as e:
        return e
    # Was `return true`: an undefined name, so every successful scrape ended
    # in NameError instead of reporting success.
    return True
if __name__ == "__main__":
# seed_url = "http://books.toscrape.com/catalogue/page-{}.html"
seed_url = "http://wnmtl.org/chapter/324909-heavenly-wolf-valley.html"
# seed_url = "http://wnmtl.org/chapter/{}.html"
print("Web scraping has begun")
result = browse_and_scrape(seed_url)
if result == True:
print("Web scraping is now complete!")
else:
print(f"Oops, That doesn't seem right!!! - {result}")`
All the commented-out lines are things I've tried in order to rip the text from the tag. From my inspection of the developer console in the browser, all the text is in the tag with the id chapterContent. My plan is to iteratively get the text, store it, get the link for the next page and repeat, but I've been stuck for a while now. Any suggestions?
Instead of scraping each page, you can directly get the text from this API endpoint using requests.
https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/324909
The last item in the above API is the chapter ID (324909). You can navigate to chapters by giving in the chapter IDs.
The next and prev chapter IDs are present in the current chapter's API endpoint. Have a look at the above URL in browser to understand it better.
Here is the full recursive code that writes the text from 3 pages to a file called novel.txt. You may change the number of pages and other details as per your need.
import requests
def get_data(chapter_id, pages):
    """Append up to ``pages`` consecutive chapters to ``novel.txt``.

    chapter_id: id of the first chapter to download (str or int).
    pages: maximum number of chapters to write; zero or less writes nothing.

    Starting at ``chapter_id``, each chapter is fetched from the API, its
    title and content are appended to ``novel.txt``, and the chapter's
    ``nextId`` field is followed to the next one.

    Returns None. Raises ``requests.HTTPError`` when the API answers with an
    error status.
    """
    if pages <= 0 or not chapter_id:
        return
    base_url = 'https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/'
    # utf-8 is explicit so chapter text with non-ASCII characters is written
    # the same way on every platform.
    with open('novel.txt', 'a', encoding='utf-8') as f:
        # A loop instead of recursion: a large `pages` no longer runs into
        # the interpreter's recursion limit.
        # Stops early on an empty nextId - assumes the API leaves it empty
        # after the last chapter; TODO confirm.
        while pages > 0 and chapter_id:
            r = requests.get(base_url + str(chapter_id))
            # Fail on an HTTP error instead of on a confusing KeyError below.
            r.raise_for_status()
            data = r.json()['data']
            title = data['title']
            f.write(f'\n***** Chapter: {title} *****\n')
            f.write(data['content'] + '\n')
            print(f"Chapter: '{title}' written to file.")
            chapter_id = data['nextId']
            pages -= 1
curr_id = '324909'
get_data(curr_id, 3)
Chapter: 'Heavenly Wolf Valley' written to file.
Chapter: 'Leaving' written to file.
Chapter: 'Pure Fabrication' written to file.
Python code:
# Common prefix of the player index pages
url = 'https://www.basketball-reference.com/players/'
# The letters 'a' through 'z'
initial = list(string.ascii_lowercase)
# One index URL per letter: prefix + letter
initial_url = [url + i for i in initial]
# Raw HTML of every index page (one request per letter)
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
# One parsed document per page
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
# Calling a soup object with a tag name is shorthand for find_all, so this
# is one list of <a> tags per page
tags_initial = [i('a') for i in soup_initial]
# The <a> tag at index 50 of the first page
print(tags_initial[0][50])
Results example:
Shareef Abdur-Rahim
From the example above, I want to extract the name of the players which is 'Shareef Abdur-Rahim', but I want to do it for all the tags_initial lists,
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe that could help you :
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = 'https://www.basketball-reference.com/players/'
# Alphabet
initial = list(string.ascii_lowercase)
datas = []
# URLS
urls = [url + i for i in initial]
# Built once instead of once per page; matches any href containing "players"
players_href = re.compile("players")
# The loop variable has its own name so the base `url` above is not
# overwritten by the last page URL.
for page_url in urls:
    # Soup Object
    soup = BeautifulSoup(urlopen(page_url), 'html.parser')
    # Players link
    url_links = soup.findAll("a", href=players_href)
    for link in url_links:
        # Player name (None when the anchor holds more than a single string)
        datas.append(link.string)
print("datas : ", datas)
"datas" then contains the names of all the players, but I advise you to do a little post-processing afterwards to remove erroneous entries such as "..." and any duplicates.
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
# Walk through the markup in three steps: where the anchor starts, where its
# opening tag ends, and where the closing tag begins.
anchor_start = html.find("a href")
index = html.find(">", anchor_start) + 1   # first character of the link text
index_end = html.find("<", index)          # one past its last character
link_text = html[index:index_end]
print(link_text)
If you're using a scraper library it probably has a similar function built-in.
I am trying to get all emails from a specific page and separate them into an individual variable or even better a dictionary. This is some code.
import requests
import re
import json
from bs4 import BeautifulSoup
page = "http://www.example.net"
info = requests.get(page)
# Only HTTP 200 is reported as success
print("Page accessed" if info.status_code == 200 else "Error accessing page")
code = info.content
soup = BeautifulSoup(code, 'lxml')
# All anchors whose href starts with "mailto:"
mailto_pattern = re.compile(r"^mailto:")
allEmails = soup.find_all("a", href=mailto_pattern)
print(allEmails)
print(type(allEmails))
sep = ","
allEmailsStr = str(allEmails)
print(type(allEmailsStr))
# Keep only what precedes the first separator in the stringified result
j = allEmailsStr.partition(sep)[0]
print(j)
Excuse the poor variable names because I put this together so it would be fine by itself. The output from the example website would be for example something like
[k, kolyma, location, balkans]
So if I ran the program it would return only
[k
But if I wanted it to return every email on there individually how would I do that?
To get just the email str you can try:
# Each link's href minus the "mailto:" text, i.e. just the address
emails = [email_link.get("href").replace('mailto:', '') for email_link in allEmails]
print(emails)
Based on your expected output, you can use the unwrap function of BeautifulSoup
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
for Email in allEmails:
print(Email.unwrap()) #This will print the whole element along with tag
# k
I have some HTML formatted text I've got with BeautifulSoup. I'd like to convert all italic (tag i), bold (b) and links (a href) to Word format via docx run command.
I can make a paragraph:
p = document.add_paragraph('text')
I can ADD next sequence as bold/italic:
p.add_run('bold').bold = True
p.add_run('italic.').italic = True
Intuitively, I could find all particular tags (ie. soup.find_all('i')) and then watch indices and then concatenate partial strings...
...but maybe there's a better, more elegant way?
I don't want libraries or solutions that just convert a html page to word and save them. I want a little more control.
I got nowhere with a dictionary. Here is the code and visual wrong (from code) and right (desired) result:
from docx import Document
import os
from bs4 import BeautifulSoup
html = 'hi, I am link this is some nice regular text. <i> oooh, but I am italic</i> ' \
' or I can be <b>bold</b> '\
' or even <i><b>bold and italic</b></i>'
def get_tags(text):
    """Parse ``text`` as HTML and collect its italic and bold tags.

    Returns a dict with the keys "i" and "b"; each value is what ``find_all``
    returns for that tag name.
    """
    soup = BeautifulSoup(text, "html.parser")
    return {tag_name: soup.find_all(tag_name) for tag_name in ("i", "b")}
def make_test_word():
    """Write the raw ``html`` string into a new Word document and open it.

    The markup is added as one plain paragraph, tags included; nothing is
    turned into bold or italic runs yet. The file is saved as
    ``demo_html.docx`` and opened with ``os.startfile`` (Windows only).
    """
    output_name="demo_html.docx"
    word_doc = Document()
    word_doc.add_heading('Demo HTML', 0)
    # Parsed but not used yet
    soup = BeautifulSoup(html, "html.parser")
    paragraph = word_doc.add_paragraph(html)
    # paragraph.add_run('bold').bold = True
    # paragraph.add_run(' and some ')
    # paragraph.add_run('italic.').italic = True
    word_doc.save(output_name)
    os.startfile(output_name)
make_test_word()
I just wrote a bit of code to convert the text from a tkinter Text widget over to a word document, including any bold tags that the user can add. This isn't a complete solution for you, but it may help you to start toward a working solution. I think you're going to have to do some regex work to get the hyperlinks transferred to the word document. Stacked formatting tags may also get tricky. I hope this helps:
from docx import Document
html = 'HTML string <b>here</b>.'
# Cut the markup at every '<' so each piece begins with at most one tag, then
# give the pieces back the '<' that split() consumed.
html = html.split('<')
html = [html[0]] + ['<'+l for l in html[1:]]
doc = Document()
p = doc.add_paragraph()
for run in html:
    if run.startswith('<b>'):
        # Remove the tag by length. str.lstrip('<b>') removes any leading
        # '<', 'b' or '>' characters, so '<b>bold' would come out as 'old'.
        run = run[len('<b>'):]
        runner = p.add_run(run)
        runner.bold = True
    elif run.startswith('</b>'):
        # Text after a closing tag is back to regular weight
        run = run[len('</b>'):]
        p.add_run(run)
    else:
        p.add_run(run)
doc.save('test.docx')
I came back to it and made it possible to parse out multiple formatting tags. This will keep a tally of what formatting tags are in play in a list. At each tag, a new run is created, and formatting for the run is set by the current tags in play.
from docx import Document
import re
import docx
from docx.shared import Pt
from docx.enum.dml import MSO_THEME_COLOR_INDEX
def add_hyperlink(paragraph, text, url):
    """Append an external hyperlink showing ``text`` to ``paragraph``.

    paragraph: python-docx paragraph that receives the link
    text: visible text of the link
    url: link target, registered as an external relationship of the part
    returns: the ``w:hyperlink`` element that was created
    """
    make_element = docx.oxml.shared.OxmlElement

    # Register the target with the paragraph's part to get a relation id
    relation_id = paragraph.part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # w:hyperlink element that refers to that relation
    link = make_element('w:hyperlink')
    link.set(docx.oxml.shared.qn('r:id'), relation_id)

    # w:r element with an empty w:rPr, carrying the visible text
    text_run = make_element('w:r')
    text_run.append(make_element('w:rPr'))
    text_run.text = text
    link.append(text_run)

    # Put the hyperlink into a fresh Run object of the paragraph
    wrapper = paragraph.add_run()
    wrapper._r.append(link)

    # Workaround for the lack of a hyperlink style (doesn't go purple after
    # using the link); delete this if the template has a hyperlink style
    wrapper.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    wrapper.font.underline = True
    return link
html = '<H1>I want to</H1> <u>convert HTML to docx in <b>bold and <i>bold italic</i></b>.</u>'
# Cut the markup at every '<' so each piece begins with at most one tag, then
# give the pieces back the '<' that split() consumed.
html = html.split('<')
html = [html[0]] + ['<'+piece for piece in html[1:]]
# Formatting tags currently open, in the order they were opened
tags = []
doc = Document()
p = doc.add_paragraph()
leading_tag = re.compile('(?:<)(.*?)(?:>)')
href_value = re.compile('.*?(?:href=")(.*?)(?:").*?')
for run in html:
    # Update the open-tag list from the tag (if any) that starts this piece
    found = leading_tag.match(run)
    if found is None:
        tag_strip = ''
    else:
        tag_strip = found.group(0)
        tag_change = found.group(1)
        if not tag_change.startswith('/'):
            tags.append(tag_change)
        else:
            if tag_change.startswith('/a'):
                # An opening <a ...> is stored with its attributes, so look
                # up that exact entry to remove it
                tag_change = next(tag for tag in tags if tag.startswith('a '))
            tag_change = tag_change.strip('/')
            tags.remove(tag_change)
    hyperlink = [tag for tag in tags if tag.startswith('a ')]
    if not run.startswith('<'):
        # Text in front of the first tag: plain run
        p.add_run(run)
        continue
    run = run.replace(tag_strip, '')
    if hyperlink:
        # Inside a link: the run is emitted as a hyperlink to the href of
        # the first open <a> tag
        add_hyperlink(p, run, href_value.match(hyperlink[0]).group(1))
        continue
    # Otherwise emit a run formatted by every tag that is currently open
    runner = p.add_run(run)
    if 'b' in tags:
        runner.bold = True
    if 'u' in tags:
        runner.underline = True
    if 'i' in tags:
        runner.italic = True
    if 'H1' in tags:
        runner.font.size = Pt(24)
doc.save('test.docx')
Hyperlink function thanks to this question. My concern here is that you will need to manually code for every HTML tag that you want to carry over to the docx. I imagine that could be a large number. I've given some examples of tags you may want to account for.
Alternatively, you can just save your html code as a string and do:
from htmldocx import HtmlToDocx
new_parser = HtmlToDocx()
new_parser.parse_html_file("html_filename", "docx_filename")
#Files extensions not needed, but tolerated
I have to write a program that will read the HTML from this link(http://python-data.dr-chuck.net/known_by_Maira.html), extract the href= values from the anchor tags, scan for a tag that is in a particular position relative to the first name in the list, follow that link and repeat the process a number of times and report the last name you find.
I am supposed to find the link at position 18 (the first name is 1), follow that link and repeat this process 7 times. The answer is the last name that I retrieve.
Here is the code I found and it works just fine.
import urllib
from BeautifulSoup import *
# Follow the link at a fixed position on each page, `count` times in a row,
# and print the text of the last link followed.
url = raw_input("Enter URL: ")
count = int(raw_input("Enter count: "))
position = int(raw_input("Enter position: "))   # 1-based, hence position-1 below
names = []
while count > 0:
    print "retrieving: {0}".format(url)
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page)
    # all <a> tags of the page
    tag = soup('a')
    name = tag[position-1].string
    names.append(name)
    # the chosen link becomes the page fetched by the next iteration
    url = tag[position-1]['href']
    count -= 1
print names[-1]
I would really appreciate if someone could explain to me like you would to a 10 year old, what's going on inside the while loop. I am new to Python and would really appreciate the guidance.
while count > 0:                          # because of `count -= 1` below,
                                          # the loop runs `count` times
    print "retrieving: {0}".format(url)   # just prints out the next web page
                                          # you are going to get
    page = urllib.urlopen(url)            # urls reference web pages (well,
                                          # many types of web content, but
                                          # we'll stick with web pages)
    soup = BeautifulSoup(page)            # web pages are frequently written
                                          # in html, which can be messy. This
                                          # package "unmessifies" it
    tag = soup('a')                       # in html you can highlight text and
                                          # reference other web pages with <a>
                                          # tags. This gets all of the <a>
                                          # tags in a list
    name = tag[position-1].string         # this gets the <a> tag at
                                          # position-1 and then gets its text
                                          # value
    names.append(name)                    # this puts that value in your own
                                          # list
    url = tag[position-1]['href']         # html tags can have attributes. On
                                          # an <a> tag, the href="something"
                                          # attribute references another web
                                          # page. You store it in `url` so
                                          # that it's the page you grab on the
                                          # next iteration of the loop
    count -= 1
You enter the number of urls you want to retrieve from a page
0) prints url
1) opens url
2) reads source
BeautifulSoup docs
3) gets every a tag
4) gets the whole <a ...></a> I think
5) adds it to a list names
6) gets url from the last item of names, ie pulls href from <a ...></a>
7) prints the last of the list names
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# TLS context for urlopen. `ctx` was passed to urlopen below without ever
# being defined, so the first request raised NameError.
ctx = ssl.create_default_context()
total=0
url = input('Enter - ')
count = int(input('enter count-'))
pos = int(input('enter position-'))
# `<=` makes the loop run count+1 times: once for the page that was entered
# and once per followed link, so the last "Retrieving" line is the page
# reached after `count` hops.
while total<=count:
    html = urllib.request.urlopen(url, context=ctx).read()
    print("Retrieving",url)
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    # Next page: the link at position `pos` (1-based), or the last link when
    # the page has fewer than `pos`. `url` is left alone when `pos` is not
    # positive or the page has no links.
    candidates = tags[:max(pos, 0)]
    if candidates:
        url = candidates[-1].get('href', None)
    total=total+1
Solution with explanations.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
url = input('Enter - ')
count = int(input('Enter count: '))
position = int(input ('Enter position: '))
names = []
# One hop per iteration: open the page, pick the link at `position`
# (1-based), remember its text and continue from its target.
for _ in range(count):
    print('Retrieving: {}'.format(url))
    # `with` closes the HTTP response once the page has been parsed
    with urllib.request.urlopen(url) as html:  # open the url using urllib
        soup = BeautifulSoup(html, 'html.parser')  # parse html data in a clean format
    # Retrieve all of the anchor tags
    tags = soup('a')
    # This gets the <a> tag at position-1 and then gets its text value
    name = tags[position-1].string
    names.append(name)  # add the name to our list
    url = tags[position-1]['href']  # retrieve the url for next iteration
print(names)
# The answer is the last name collected. Indexing with count-1 only worked
# because the countdown had left count at 0; with no hops it raised
# IndexError, so the line is skipped when nothing was collected.
if names:
    print('Answer: ',names[-1])
Hope it helps.