Printing HTML output in Python

I've been creating a program with a variety of uses. I call it the Electronic Database with Direct Yield (EDDY). The feature I've had the most trouble with is EDDY's Google search capability. EDDY asks the user for an input, edits it slightly by replacing any spaces (' ') with plus signs ('+'), and then fetches the resulting URL (without opening a browser). It then reads the HTML of the page and is SUPPOSED to print the result titles and descriptions from the page, without the HTML code itself.
This is what I have so far.
import urllib
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import requests

def cleanup(url):
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    length = len(soup.prettify()) - 1
    print(soup.prettify()[16800:length])
    print(soup.title.text)
    print(soup.body.text)

def eddysearch():
    headers = {'User-Agent': 'Chrome.exe'}
    reg_url = "http://www.google.com/search?q="
    print("Ready for query")
    query = input()
    if(query != "quit"):
        print("Searching for keyword: " + query)
        print("Please wait...")
        search = urllib.parse.quote_plus(query)
        url = reg_url + search
        req = Request(url=url, headers=headers)
        html = urlopen(req).read()
        cleanup(url)
        eddysearch()

eddysearch()
Can anyone help me out? Thanks in advance!

If you don't want to use an SSL certificate, you can call .read():
# Python 2.7.x
import urllib
url = "http://stackoverflow.com"
f = urllib.urlopen(url)
print f.read()
#Python 3.x
import urllib.request
url = 'http://www.stackoverflow.com'
f = urllib.request.urlopen(url)
print(f.read())
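To get just the result titles rather than the raw HTML, one option is to parse the page with BeautifulSoup and pick out the heading elements. A minimal hedged sketch follows: the h3 selector is an assumption about Google's current markup, which changes often, and Google may block automated clients, so a real project would be better served by a search API.
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

def search_titles(query):
    # quote_plus replaces spaces with '+', as EDDY does by hand
    url = "https://www.google.com/search?q=" + quote_plus(query)
    headers = {'User-Agent': 'Mozilla/5.0'}  # a realistic UA avoids some blocks
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, "html.parser")
    # ASSUMPTION: result titles are rendered as <h3> elements;
    # this selector may need updating when Google's markup changes.
    for heading in soup.find_all('h3'):
        print(heading.get_text())

search_titles("python tutorial")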

Related

How can I access the first YouTube search result?

I tried the code below, but it doesn't work. Could anyone help me fix it?
It says that video_link is not defined.
I think the error is in the for link in soup.find_all('a'): loop.
import os
import glob
from bs4 import BeautifulSoup
import urllib
from urllib.parse import quote_plus as qp

DEFAULT_AUDIO_QUALITY = '320K'

search = ' '
# We do not want to accept empty inputs :)
while search == '':
    search = raw_input('Enter your query ')
search = qp(search)
print('Making a Query Request! ')

response = urllib.request.urlopen('https://www.youtube.com/results?search_query=' + search)
html = response.read()
soup = BeautifulSoup(html, 'html.parser')

for link in soup.find_all('a'):
    if '/watch?v=' in link.get('href'):
        print(link.get('href'))
        # May change when the YouTube website gets updated in the future.
        video_link = link.get('href')
        break

video_link = 'http://www.youtube.com/' + video_link

command = ('youtube-dl --extract-audio --audio-format mp3 --audio-quality ' +
           DEFAULT_AUDIO_QUALITY + ' ' + video_link)
print('Downloading...')
os.system(command)
but this is giving an error.
To get the correct version of the YouTube HTML page, use a correct User-Agent HTTP header.
For example:
import requests
from bs4 import BeautifulSoup

search = 'tree'
headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}
html = requests.get('https://www.youtube.com/results?search_query=' + search, headers=headers).text
soup = BeautifulSoup(html, 'html.parser')

for link in soup.find_all('a'):
    if '/watch?v=' in link.get('href'):
        print(link.get('href'))
        # May change when the YouTube website gets updated in the future.
        video_link = link.get('href')
Prints:
/watch?v=ZKAM_Hk4eZ0
/watch?v=ZKAM_Hk4eZ0
/watch?v=wCQfkEkePx8
/watch?v=wCQfkEkePx8
/watch?v=Va0vs1fhhNI
/watch?v=Va0vs1fhhNI
/watch?v=kUDPr5xPYhM
/watch?v=kUDPr5xPYhM
/watch?v=kSjXOebB7eI
/watch?v=kSjXOebB7eI
/watch?v=IiDkVftBgak
/watch?v=IiDkVftBgak
/watch?v=F3hTW9e20d8
/watch?v=F3hTW9e20d8
/watch?v=Iy-dJwHVX84
/watch?v=Iy-dJwHVX84
... etc.
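Each result appears twice because YouTube renders the thumbnail and the title as separate anchors. A small hedged follow-up, reusing the soup object from the answer above, that deduplicates the links and builds absolute URLs:
from urllib.parse import urljoin

seen = set()
for link in soup.find_all('a'):
    href = link.get('href') or ''  # some anchors have no href at all
    if '/watch?v=' in href and href not in seen:
        seen.add(href)
        print(urljoin('https://www.youtube.com/', href))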

How to show full content of another website in Django?

I am trying to show the full content of another website on my Django site, and to modify the links that get clicked when people browse other websites through my site.
import requests
import urllib.request

def one(request, myurl='google.com'):
    url = 'http://' + myurl
    r = requests.get(url)
    return HttpResponse(r)
The result of requests.get(…) is a Response object, not a string. You can obtain the raw bytes with its content attribute. For example:
import requests
from django.http import HttpResponse

def one(request, myurl='google.com'):
    url = 'http://' + myurl
    r = requests.get(url)
    return HttpResponse(
        content=r.content,
        content_type=r.headers.get('Content-Type'),
        status=r.status_code,
    )
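To also rewrite the links that are clicked, as the question asks, one hedged approach is to parse the fetched page and point every anchor back through your own view before returning it. The '/view/' prefix below is a hypothetical URL pattern; adjust it to whatever your urls.py routes to this view.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from django.http import HttpResponse

def proxy(request, myurl='google.com'):
    url = 'http://' + myurl
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    for a in soup.find_all('a', href=True):
        absolute = urljoin(url, a['href'])
        # Hypothetical routing scheme: strip the scheme and send the
        # target back through this view via the '/view/' URL pattern.
        a['href'] = '/view/' + absolute.replace('http://', '', 1)
    return HttpResponse(str(soup), content_type='text/html')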

Unable to extract images from webpage in Python using Beautiful Soup

I am trying to extract all the images from the URL below. However, I don't understand the HTTP Error 403: Forbidden. Can it be handled with error handling, or can the URL simply not be scraped due to restrictions?
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib.request

def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    # this makes a list of bs4 element tags
    images = [img for img in soup.findAll('img')]
    print(str(len(images)) + " images found.")
    print("downloading to current directory ")
    # compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename = each.split('/')[-1]
        urllib.request.urlretrieve(each, filename)
    return image_links

get_images("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")
Some sites need you to specify a User-Agent header:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request

def make_soup(url):
    site = url
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(site, headers=hdr)
    page = urlopen(req)
    return BeautifulSoup(page, 'html.parser')
You can use this function for image scraping. Relying on the img tag alone is not enough nowadays; we can implement something like the function below, which fulfills the requirement. It does not rely on any particular tag, so wherever an image link is present it will grab it.
import re

def extract_ImageUrl(soup_chunk):
    urls_found = []
    for tags in soup_chunk.find_all():
        attributes = tags.attrs
        if 'http' in str(attributes):
            for links in attributes.values():
                if re.match(r'http.*\.jpg|png', str(links)):
                    if len(str(links).split()) <= 1:
                        urls_found.append(links)
                    else:
                        link = [i.strip() for i in str(links).split()
                                if re.match(r'http.*\.jpg|png', str(i))]
                        urls_found = urls_found + link
    print("Found {} image links".format(len(urls_found)))
    return urls_found
It's an initial thought and requires updates to make it much better.
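A quick usage sketch, assuming the make_soup helper from the answer above is in scope:
soup = make_soup("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")
for image_url in extract_ImageUrl(soup):
    print(image_url)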

How can I download all types of files in Python with the requests library?

I am building a crawler in Python and I have a list of hrefs from the page.
Now I have a list of file extensions to download, like:
list = ['zip','rar','pdf','mp3']
How can I save the files from those URLs to a local directory using Python?
EDIT:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
Going by your posted example:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
So, the URL you want to fetch next is presumably list_urls[6]['href'].
The first trick is that this might be a relative URL rather than absolute, so join it against the page's URL (this uses Python 2's urlparse module; on Python 3 the same function lives in urllib.parse):
import urlparse

newurl = list_urls[6]['href']
absurl = urlparse.urljoin(site.url, newurl)
Also, you want to only fetch the file if it has the right extension. str.endswith accepts a tuple, so with the question's list as a tuple of suffixes:
extensions = ('.zip', '.rar', '.pdf', '.mp3')
if not absurl.endswith(extensions):
    return # or break or whatever
But once you've decided what URL you want to download, it's no harder than your initial fetch:
page = urllib2.urlopen(absurl)
html = page.read()
path = urlparse.urlparse(absurl).path
name = os.path.basename(path)
with open(name, 'wb') as f:
    f.write(html)
That's mostly it.
There are a few things you might want to add, but if so, you have to add them all manually. For example:
Look for a Content-disposition header with a suggested filename to use in place of the URL's basename (see the sketch after this list).
Copy from page to f with shutil.copyfileobj instead of reading the whole thing into memory and then writing it out.
Deal with existing files with the same name.
…
But that's the basics.
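For the first item, a hedged sketch of extracting a suggested filename from the Content-Disposition header, written for Python 2 to match the answer's urllib2 code (the regex is a simplification of the full header grammar):
import re

def filename_from_response(page, fallback):
    # Look for: Content-Disposition: attachment; filename="something.pdf"
    cd = page.info().getheader('Content-Disposition') or ''
    match = re.search(r'filename="?([^";]+)"?', cd)
    return match.group(1) if match else fallback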
You can use the Python requests library, as you asked in the question: http://www.python-requests.org
You can save a file from a URL like this:
import requests

url = 'http://i.stack.imgur.com/0LJdh.jpg'
data = requests.get(url).content
filename = "image.jpg"
with open(filename, 'wb') as f:
    f.write(data)
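To tie this back to the question's list of extensions, a hedged sketch that filters the collected links and saves each matching file with requests; it assumes list_urls from the soup.find_all('a') snippet above, and the base URL is the question's example:
import os
import requests
from urllib.parse import urljoin, urlparse

extensions = ('.zip', '.rar', '.pdf', '.mp3')
base_url = "http://www.example.com/downlaod"

for a in list_urls:  # list_urls comes from soup.find_all('a') above
    href = a.get('href')
    if not href:
        continue
    absurl = urljoin(base_url, href)
    if absurl.lower().endswith(extensions):
        name = os.path.basename(urlparse(absurl).path)
        data = requests.get(absurl).content
        with open(name, 'wb') as f:
            f.write(data)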
A solution using urllib3:
import os
import urllib3
from bs4 import BeautifulSoup
import urllib.parse
url = "https://path/site"
site = urllib3.PoolManager()
html = site.request('GET', url)
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('a')
and then a recursive function to get all the files:
def recursive_function(list_urls):
    if not list_urls:  # base case: stop when the list is exhausted
        return
    newurl = list_urls[0]['href']
    absurl = url + newurl
    list_urls.pop(0)
    if absurl.endswith(extensions):  # verify it has one of the targeted extensions
        html = site.request('GET', absurl)
        name = os.path.basename(absurl)
        with open(name, 'wb') as f:
            f.write(html.data)
    return recursive_function(list_urls)
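Note that Python's default recursion limit (roughly 1000 frames) makes this fragile for long link lists; an equivalent iterative loop, assuming the same url, site, and extensions variables as above:
for a in list_urls:
    absurl = url + a['href']
    if absurl.endswith(extensions):
        response = site.request('GET', absurl)
        with open(os.path.basename(absurl), 'wb') as f:
            f.write(response.data)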

How can I read the contents of a URL with Python?

The following works when I paste it on the browser:
http://www.somesite.com/details.pl?urn=2344
But when I try reading the URL with Python nothing happens:
link = 'http://www.somesite.com/details.pl?urn=2344'
f = urllib.urlopen(link)
myfile = f.readline()
print myfile
Do I need to encode the URL, or is there something I'm not seeing?
To answer your question:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
f = urllib.urlopen(link)
myfile = f.read()
print(myfile)
You need to use read(), not readline().
EDIT (2018-06-25): Since Python 3, the legacy urllib.urlopen() was replaced by urllib.request.urlopen() (see notes from https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen for details).
If you're using Python 3, see answers by Martin Thoma or i.n.n.m within this question:
https://stackoverflow.com/a/28040508/158111 (Python 2/3 compat)
https://stackoverflow.com/a/45886824/158111 (Python 3)
Or, just get this library here: http://docs.python-requests.org/en/latest/ and seriously use it :)
import requests
link = "http://www.somesite.com/details.pl?urn=2344"
f = requests.get(link)
print(f.text)
For Python 3 users, to save time, use the following code:
from urllib.request import urlopen
link = "https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html"
f = urlopen(link)
myfile = f.read()
print(myfile)
I know there are different threads for the error NameError: urlopen is not defined, but I thought this might save time.
None of these answers are very good for Python 3 (tested on latest version at the time of this post).
This is how you do it...
import urllib.request
import urllib.error

try:
    with urllib.request.urlopen('http://www.python.org/') as f:
        print(f.read().decode('utf-8'))
except urllib.error.URLError as e:
    print(e.reason)
The above is for content that is encoded as UTF-8. Remove .decode('utf-8') if you want the raw bytes instead of a decoded string.
Documentation:
https://docs.python.org/3/library/urllib.request.html#module-urllib.request
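If you would rather decode with the encoding the server declares instead of hard-coding UTF-8, one hedged approach is to read the charset from the response headers and fall back to UTF-8:
import urllib.request

with urllib.request.urlopen('http://www.python.org/') as f:
    charset = f.headers.get_content_charset() or 'utf-8'
    print(f.read().decode(charset))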
A solution which works with both Python 2.X and Python 3.X makes use of the Python 2 and 3 compatibility library six:
from six.moves.urllib.request import urlopen
link = "http://www.somesite.com/details.pl?urn=2344"
response = urlopen(link)
content = response.read()
print(content)
We can read a website's HTML content as below:
from urllib.request import urlopen
response = urlopen('http://google.com/')
html = response.read()
print(html)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Works on Python 3 and Python 2,
# when the server knows where the request is coming from.
import sys

if sys.version_info[0] == 3:
    from urllib.request import urlopen
else:
    from urllib import urlopen

with urlopen('https://www.facebook.com/') as url:
    data = url.read()
print(data)

# When the server does not know where the request is coming from.
# Works on Python 3.
import urllib.request

user_agent = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) '
              'Gecko/2009021910 Firefox/3.0.7')
url = 'https://www.facebook.com/'
headers = {'User-Agent': user_agent}
request = urllib.request.Request(url, None, headers)
response = urllib.request.urlopen(request)
data = response.read()
print(data)
from urllib.request import urlopen

# if the page contains Chinese characters, apply decode()
html = urlopen("https://blog.csdn.net/qq_39591494/article/details/83934260").read().decode('utf-8')
print(html)
from urllib.request import urlopen
from bs4 import BeautifulSoup
link = "https://www.timeshighereducation.com/hub/sinorbis"
f = urlopen(link)
soup = BeautifulSoup(f, 'html.parser')
# get the text content of the webpage
text = soup.get_text()
print(text)
Using BeautifulSoup's HTML parser, we can extract the text content of the webpage.
I used the following code:
import urllib

def read_text():
    quotes = urllib.urlopen("https://s3.amazonaws.com/udacity-hosted-downloads/ud036/movie_quotes.txt")
    contents_file = quotes.read()
    print contents_file

read_text()
# retrieving data from a URL
# only for Python 3
import urllib.request

def main():
    url = "http://docs.python.org"
    # retrieving data from the URL
    webUrl = urllib.request.urlopen(url)
    print("Result code: " + str(webUrl.getcode()))
    # print data from the URL
    print("Returned data: -----------------")
    data = webUrl.read().decode("utf-8")
    print(data)

if __name__ == "__main__":
    main()
The URL should be a string:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
f = urllib.urlopen(link)
myfile = f.readline()
print myfile
