# Question excerpt: fetch a page and parse it with the BeautifulSoup 3 package.
import urllib2
import urllib
from BeautifulSoup import BeautifulSoup # html: binds the name BeautifulSoup to the class
from BeautifulSoup import BeautifulStoneSoup # xml
# NOTE: this import rebinds the name BeautifulSoup to the module itself,
# replacing the class that the `from ... import` line bound above.
import BeautifulSoup # everything
import re
# NOTE(review): `o` and `p` are not defined in this excerpt -- presumably an
# opener object and request data created elsewhere; confirm against the full script.
f = o.open( 'http://www.google.com', p)
html = f.read()
f.close()
# BeautifulSoup refers to the module at this point, so calling it fails.
soup = BeautifulSoup(html)
I'm getting an error on the line soup = BeautifulSoup(html); it says: 'module' object is not callable.
Your import BeautifulSoup statement makes the name BeautifulSoup refer to the module, not to the class it referred to after from BeautifulSoup import BeautifulSoup. If you're going to import the whole module, you might want to omit the from ... import line, or bind the class to a different name before importing the module:
from BeautifulSoup import BeautifulSoup
Soup = BeautifulSoup
...
import BeautifulSoup
....
soup = Soup(html)
#Blair's answer has the right slant but I'd perform some things slightly differently, i.e.:
import BeautifulSoup
Soup = BeautifulSoup.BeautifulSoup
(recommended), or
import BeautifulSoup
from BeautifulSoup import BeautifulSoup as Soup
(not bad either).
Install BeautifulSoup4
sudo easy_install BeautifulSoup4
Recommendation
from bs4 import BeautifulSoup
Related
import requests
from bs4 import BeautifulSoup
result=requests.get('http://textfiles.com/stories/').text
soup=BeautifulSoup (result, 'lxml')
stories=soup.find_all('tr')
print (stories)
The find method works, but find_all doesn't. I'm not sure why; maybe it is because the element doesn't have a class?
The correct code is:
import requests
from bs4 import BeautifulSoup
result=requests.get('http://textfiles.com/stories/')
soup = BeautifulSoup(result.content, 'html5lib')
stories=soup.find_all('tr')
You can access each 'tr' element by index:
stories[0]
The 0 can be replaced with any valid index into the list.
You can also use pandas,
e.g.:
import pandas
import requests
from bs4 import BeautifulSoup
result=requests.get('http://textfiles.com/stories/')
soup = BeautifulSoup(result.content, 'html5lib')
df=pandas.read_html(soup.prettify())
print(df)
I have this view in Anaconda.
However, I can't seem to use BS in my script.
import string
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
#import BeautifulSoup as bs
alphabets = string.ascii_lowercase
for i in alphabets:
#print(i)
html = urlopen("http://www.airlineupdate.com/content_public/codes/airportcodes/airports-by-iata/iata-" + i + ".htm")
print(html)
for j in html:
#soup = bs4(html, "html.parser")
soup = bs(html, "html.parser")
f = open('C:\\Users\\Excel\\Desktop\\URL.txt', 'w')
When I try to run the code above, I get the following error:
ModuleNotFoundError: No module named 'BeautifulSoup4'
Can someone enlighten me as to what's going on here?
According to the documentation, the import is:
from bs4 import BeautifulSoup
and, based on your code, it seems you want to call it as bs(), so use:
from bs4 import BeautifulSoup as bs
I am trying to get the HTML source of a web page using beautifulsoup.
import bs4 as bs
import requests
import urllib.request
sourceUrl='https://www.pakwheels.com/forums/t/planing-a-trip-from-karachi-to-lahore-by-road-in-feb-2017/414115/2.html'
source=urllib.request.urlopen(sourceUrl).read()
soup=bs.BeautifulSoup(source,'html.parser')
print(soup)
I want the HTML source of the page. This is what I am getting now:
'ps.store("siteSettings", {"title":"PakWheels Forums","contact_email":"sami.ullah#pakeventures.com","contact_url":"https://www.pakwheels.com/main/contact_us","logo_url":"https://www.pakwheels.com/assets/logo.png","logo_small_url":"/images/d-logo-sketch-small.png","mobile_logo_url":"data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4NCjwhLS0gR2VuZXJhdG9yOiBBZG9iZSBJbGx1c3RyYXRvciAxNi4wLjAsIFNWRyBFeHBvcnQgUGx1Zy1JbiAuIFNWRyBWZXJzaW9uOiA2LjAwIEJ1aWxkIDApICAtLT4NCjwhRE9DVFlQRSBzdmcgUFVCTElDICItLy9XM0MvL0RURCBTVkcgMS4xLy9FTiIgImh0dHA6Ly93d3cudzMub3JnL0dyYXBoaWNzL1NWRy8xLjEvRFREL3N2ZzExLmR0ZCI+DQo8c3ZnIHZlcnNpb249IjEuMSIgaWQ9IkxheWVyXzEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgeG1sbnM6eGxpbms9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGxpbmsiIHg9IjBweCIgeT0iMHB4Ig0KCSB3aWR0aD0iMjQwcHgiIGhlaWdodD0iNjBweCIgdmlld0JveD0iMCAwIDI0MCA2MCIgZW5hYmxlLWJhY2tncm91bmQ9Im5ldyAwIDAgMjQwIDYwIiB4bWw6c3BhY2U9InByZXNlcnZlIj4NCjxwYXRoIGZpbGw9IiNGRkZGRkYiIGQ9Ik02LjkwMiwyMy4yODZDMzQuNzc3LDIwLjI2Miw1Ny4yNC'
Have a look at this code:
from urllib import request
from bs4 import BeautifulSoup

url_1 = "http://www.google.com"
# Use the response as a context manager so the connection is closed as soon
# as the markup has been read and parsed.
with request.urlopen(url_1) as page:
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, which can change the resulting tree between machines.
    soup = BeautifulSoup(page, "html.parser")
# Pretty-print the parsed document tree.
print(soup.prettify())
Import everything you need correctly. Read this.
I'm trying to parse an XML page with BeautifulSoup and for some reason it's not able to find the XML parser. I don't think it's a path issue as I've used lxml to parse pages in the past, just not XML. Here's the code:
# Question excerpt: download an XML document through a proxy and parse it with bs4.
from bs4 import *
import urllib2
import lxml
from lxml import *
BASE_URL = "http://auctionresults.fcc.gov/Auction_66/Results/xml/round/66_115_database_round.xml"
# Route HTTP requests through the proxy (placeholder address).
proxy = urllib2.ProxyHandler({'http':'http://myProxy.com'})
opener = urllib2.build_opener(proxy)
# Install the opener globally so the plain urlopen() call below uses the proxy.
urllib2.install_opener(opener)
page = urllib2.urlopen(BASE_URL)
# "xml" asks bs4 for its XML parser, which is provided by lxml.
soup = BeautifulSoup(page,"xml")
# Parenthesised single-argument print produces the same output on Python 2.
print(soup)
I'm probably missing something simple, but all the XML parsing with BS questions I found on here were around bs3 and I'm using bs4 which uses a different method for parsing XML. Thanks.
If you have lxml installed, just call that as BeautifulSoup's parser instead, like below.
Code:
from bs4 import BeautifulSoup as bsoup
import requests as rq
url = "http://auctionresults.fcc.gov/Auction_66/Results/xml/round/66_115_database_round.xml"
r = rq.get(url)
soup = bsoup(r.content, "lxml")
print soup
Result:
<html><body><dataroot xmlns:od="urn:schemas-microsoft-com:officedata" xmlns:xsi="http://www.w3.org/2000/10/XMLSchema-instance" xsi:nonamespaceschemalocation="66_database.xsd"><all_bids>
<auction_id>66</auction_id>
<auction_description>Advanced Wireless Services</auction_description>
... really long list follows...
[Finished in 34.9s]
Let us know if this helps.
I want to extract some data from a website. I saved it as 'Webpage, HTML Only', in a file called soccerway.html on my Desktop.
Afterwards I wrote the following command using an IPython notebook:
from bs4 import BeautifulSoup
soup=BeautifulSoup(open("soccerway.html"))
I get the following error:
IOError: [Errno 2] No such file or directory: 'soccerway.html'
How can I solve this?
You don't need to manually save a page. Use urllib2 to get the html source you need:
from bs4 import BeautifulSoup
from urllib2 import urlopen
soup = BeautifulSoup(urlopen("http://my_site.com/mypage"))
Example:
>>> from bs4 import BeautifulSoup
>>> from urllib2 import urlopen
>>> soup = BeautifulSoup(urlopen('http://google.com'))
>>> soup('a')
[<a class="gb1" href="http://www.google.com/imghp?hl=en&tab=wi">Images</a>,
...
]
You can use this code:
from bs4 import BeautifulSoup

# Open the saved page inside a context manager so the file handle is closed
# once parsing finishes; the local name also avoids shadowing `file`.
with open("yourfile.html", "r") as html_file:
    # An explicit parser keeps the result independent of which parsers are installed.
    soup = BeautifulSoup(html_file, "html.parser")