How to read the HTML body from a website using Python 3.x - python

I would like to connect to a specific website link and receive the HTTP response.
I have the following Python code:
import urllib.request
import os,sys,re,datetime
fp = urllib.request.urlopen("http://www.python.org")
mybytes = fp.read()
mystr = mybytes.decode(encoding=sys.stdout.encoding)
fp.close()
when I pass the response as a parameter to:
BeautifulSoup(str(mystr), 'html.parser')
to get the cleaned html text, I got the following error:
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u25bc' in position 1139: character maps to <undefined>.
The question is: how can I solve this problem?
Complete code:
import urllib.request
import os,sys,re,datetime
fp = urllib.request.urlopen("http://www.python.org")
mybytes = fp.read()
mystr = mybytes.decode(encoding=sys.stdout.encoding)
fp.close()
from bs4 import BeautifulSoup
soup = BeautifulSoup(str(mystr), 'html.parser')
mystr = soup;
print(mystr.get_text())

BeautifulSoup is perfectly happy to consume the file-like object returned by urlopen:
from urllib.request import urlopen
from bs4 import BeautifulSoup
with urlopen("...") as website:
soup = BeautifulSoup(website)
print(soup.prettify())

If you use the requests library you can avoid these complications:)
import requests
fp = requests.get("http://www.python.org")
mystr = fp.text
from bs4 import BeautifulSoup
soup = BeautifulSoup(mystr, 'html.parser')
mystr = soup;
print(mystr.get_text())

Related

Get strange letters from Arabic alphabet when scrape an Arabic website

I would like to scrape this site: http://waqfeya.com/book.php?bid=1
but when I do I get characters like these ÇáÞÑÂä ÇáßÑíã .
This is what my script looks like:
import requests
from bs4 import BeautifulSoup
BASE_URL = "http://waqfeya.com/book.php?bid=1"
source = requests.get(BASE_URL)
soup = BeautifulSoup(source.text, 'lxml')
print(soup)
I tried these things, but they didn't work for me:
source.encoding = 'utf-8'
and this:
source.encoding = 'ISO-8859-1'
also this:
soup = BeautifulSoup(source.text, from_endocing='ISO-8859-1')
But none worked for me.
Use urlopen instead of requests:
from bs4 import BeautifulSoup
from urllib import urlopen
BASE_URL = "http://waqfeya.com/book.php?bid=1"
open = urlopen(BASE_URL)
soup = BeautifulSoup(open, 'lxml')
print(soup.encode('utf-8'))
Sometimes Requests may get the encoding wrong. For this site we can get the correct encoding from the Source.
You can assign the encoding like source.encoding='windows-1256' before using source.text in BeautifulSoup.
import requests
BASE_URL = "http://waqfeya.com/book.php?bid=1"
source = requests.get(BASE_URL)
print(source.encoding)
print(source.apparent_encoding)
source.encoding='windows-1256'
print(source.text)
I was able to get all the Arabic characters properly.

Encoding issues request results

I'm learning Python and I'm trying to retrieve data from Wikipedia, but I'm getting encoding issues with special characters in the links, text, etc.:
My code:
import sys
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("https://pt.wikipedia.org/wiki/Jair_Bolsonaro")
bsObj = BeautifulSoup(html)
for link in bsObj.findAll("a"):
if 'href' in link.attrs:
print(link.attrs['href'])
result:
/wiki/Hamilton_Mour%C3%A3o
/wiki/Michel_Temer
/wiki/C%C3%A2mara_dos_Deputados_do_Brasil
...
Should be:
/wiki/Hamilton_Mourão
/wiki/Michel_Temer
/wiki/Câmara_dos_Deputados_do_Brasil
...
Solution:
import urllib.parse
And the print line changed to:
print(urllib.parse.unquote(link.attrs['href']))

Python: error while trying to scrape youtube

While trying to scrape the YouTube homepage for the title of each video, I'm running this code:
import request
from bs4 import BeautifulSoup
url = 'https://www.youtube.com'
html = requests.get(url)
soup = BeautifulSoup(html.content, "html.parser")
print(soup('a'))
and it's returning this error:
Traceback (most recent call last):
File "C:\Users\kenda\OneDrive\Desktop\Projects\youtube.py", line 7, in <
<module>
print(soup('a'))
File "C:\Users\kenda\AppData\Local\Programs\Python\Python36-
32\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f384' in
position 45442: character maps to <undefined>
[Finished in 4.83s]
How do I fix this? And why does it happen specifically when I'm scraping YouTube?
urllib is much better and more comfortable to use.
from urllib.request import urlopen
from bs4 import BeautifulSoup
The urlopen function fetches the URL and returns the HTML response:
url = 'https://www.youtube.com'
html = urlopen(url)
BeautifulSoup will parse the HTML:
soup = BeautifulSoup(html, 'html.parser')
print(soup.find_all('a'))
If you absolutely want to do it with requests, the solution is:
import requests
from bs4 import BeautifulSoup
url = 'https://www.youtube.com'
resp = requests.get(url)
html = resp.text
soup = BeautifulSoup(html, 'html.parser')
print(soup.find_all('a'))

Unicode Encode Error: Charmap cannot encode character \xa9 in Python

Hi there, I am writing scraping code, but when I try to get all the paragraphs from a website it gives me the following error:
Unicode Encode Error: Charmap cannot encode character '\xa9'
Here is my code:
#Loading Libraries
import urllib
from urllib.parse import urlparse
from urllib.parse import urljoin
import urllib.request
from bs4 import BeautifulSoup
#define URL for scraping
newsurl = "http://www.techspot.com/news/67832-netflix-exceeds-growth-expectations-home-abroad-stock-soars.html"
thepage = urllib.request.urlopen(newsurl)
soup = BeautifulSoup(thepage ,"html.parser")
article = soup.find_all('div' , {'class','articleBody'})
for pg in article:
paragraph = soup.findAll('p')
ptag = paragraph
print(ptag)
The error I am getting is the following:
Let me know how to remove this error.
soup.findAll() returns a ResultSet object, which is basically a list and does not have an encode attribute. You either meant to use .text instead:
text = soup.text
Or, "join" the texts:
text = "".join(soup.findAll(whatever, you, want))
This error sometimes occurs when using Beautiful Soup 4 (bs4), or when fetching data with requests or a similar command. Try using the code below in your print statement:
print(myHtmlData.encode("utf-8"))

How can I read the contents of an URL with Python?

The following works when I paste it into the browser:
http://www.somesite.com/details.pl?urn=2344
But when I try reading the URL with Python nothing happens:
link = 'http://www.somesite.com/details.pl?urn=2344'
f = urllib.urlopen(link)
myfile = f.readline()
print myfile
Do I need to encode the URL, or is there something I'm not seeing?
To answer your question:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
f = urllib.urlopen(link)
myfile = f.read()
print(myfile)
You need to use read(), not readline().
EDIT (2018-06-25): Since Python 3, the legacy urllib.urlopen() was replaced by urllib.request.urlopen() (see notes from https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen for details).
If you're using Python 3, see answers by Martin Thoma or i.n.n.m within this question:
https://stackoverflow.com/a/28040508/158111 (Python 2/3 compat)
https://stackoverflow.com/a/45886824/158111 (Python 3)
Or, just get this library here: http://docs.python-requests.org/en/latest/ and seriously use it :)
import requests
link = "http://www.somesite.com/details.pl?urn=2344"
f = requests.get(link)
print(f.text)
For Python 3 users, to save time, use the following code:
from urllib.request import urlopen
link = "https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html"
f = urlopen(link)
myfile = f.read()
print(myfile)
I know there are other threads about the error "NameError: name 'urlopen' is not defined", but I thought this might save time.
None of these answers are very good for Python 3 (tested on latest version at the time of this post).
This is how you do it...
import urllib.request
try:
with urllib.request.urlopen('http://www.python.org/') as f:
print(f.read().decode('utf-8'))
except urllib.error.URLError as e:
print(e.reason)
The above is for content encoded as 'utf-8'. If you remove .decode('utf-8'), f.read() returns the raw bytes without decoding them; Python does not guess the encoding for you.
Documentation:
https://docs.python.org/3/library/urllib.request.html#module-urllib.request
A solution that works with Python 2.X and Python 3.X makes use of the Python 2 and 3 compatibility library six:
from six.moves.urllib.request import urlopen
link = "http://www.somesite.com/details.pl?urn=2344"
response = urlopen(link)
content = response.read()
print(content)
We can read a website's HTML content as below:
from urllib.request import urlopen
response = urlopen('http://google.com/')
html = response.read()
print(html)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Works on python 3 and python 2.
# when server knows where the request is coming from.
import sys
if sys.version_info[0] == 3:
from urllib.request import urlopen
else:
from urllib import urlopen
with urlopen('https://www.facebook.com/') as \
url:
data = url.read()
print data
# When the server does not know where the request is coming from.
# Works on python 3.
import urllib.request
user_agent = \
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = 'https://www.facebook.com/'
headers = {'User-Agent': user_agent}
request = urllib.request.Request(url, None, headers)
response = urllib.request.urlopen(request)
data = response.read()
print data
from urllib.request import urlopen
# if has Chinese, apply decode()
html = urlopen("https://blog.csdn.net/qq_39591494/article/details/83934260").read().decode('utf-8')
print(html)
from urllib.request import urlopen
from bs4 import BeautifulSoup
link = "https://www.timeshighereducation.com/hub/sinorbis"
f = urlopen(link)
soup = BeautifulSoup(f, 'html.parser')
# get the text content of the webpage
text = soup.get_text()
print(text)
Using BeautifulSoup's HTML parser, we can extract the content of the webpage.
I used the following code:
import urllib
def read_text():
quotes = urllib.urlopen("https://s3.amazonaws.com/udacity-hosted-downloads/ud036/movie_quotes.txt")
contents_file = quotes.read()
print contents_file
read_text()
# retrieving data from url
# only for python 3
import urllib.request
def main():
url = "http://docs.python.org"
# retrieving data from URL
webUrl = urllib.request.urlopen(url)
print("Result code: " + str(webUrl.getcode()))
# print data from URL
print("Returned data: -----------------")
data = webUrl.read().decode("utf-8")
print(data)
if __name__ == "__main__":
main()
The URL should be a string:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
f = urllib.urlopen(link)
myfile = f.readline()
print myfile

Categories

Resources