How can I get the full content of another website, or modify the links that are clicked when people use other websites through my site, in Django?
import requests
import urllib.request
def one(request, myurl='google.com'):
    """Django view: fetch http://<myurl> and return it to the client.

    NOTE(review): this is the buggy version being asked about -- the
    whole requests.Response object is passed to HttpResponse instead of
    its body (r.content); HttpResponse also needs to be imported from
    django.http for this to run.  The answer below fixes both.
    """
    url = 'http://' + myurl
    r = requests.get(url)
    return HttpResponse(r)
The outcome of requests.get is a Response [requests-doc] object, not a string. You can obtain the content with content [requests-doc]. For example:
import requests
import urllib.request
def one(request, myurl='google.com'):
    """Django view that proxies http://<myurl>, relaying the upstream
    body, Content-Type and status code back to the client."""
    upstream = requests.get('http://' + myurl)
    return HttpResponse(
        content=upstream.content,
        content_type=upstream.headers.get('Content-Type'),
        status=upstream.status_code,
    )
Related
Looking to click the download as pdf button on this site: https://www.goffs.com/sales-results/sales/december-nh-sale-2021/1
The reason I can't just scrape the download link or just manually download it is that there are multiple of these sites like:
https://www.goffs.com/sales-results/sales/december-nh-sale-2021/2
https://www.goffs.com/sales-results/sales/december-nh-sale-2021/3
And I want to loop through all of them and download each as a pdf.
Current code:
import urllib.request
from requests import get
from bs4 import BeautifulSoup
# Fetch the first sale-results page.
url = "https://www.goffs.com/sales-results/sales/december-nh-sale-2021/1"
# Build the Request explicitly, then open it in one step.
response = urllib.request.urlopen(urllib.request.Request(url))
This code should get the link to the pdf:
# Explicit imports instead of 'from urllib.request import *' (wildcard
# imports hide where names come from and can shadow other snippets).
from urllib.request import Request, urlopen

# The sale-page PDFs all live under this directory.
PDF_BASE = 'https://www.goffs.com/GoffsCMS/_Sales/'

url = "https://www.goffs.com/sales-results/sales/december-nh-sale-2021/{}".format("1")
# Context manager closes the HTTP connection even if read() raises
# (the original never closed the response).
with urlopen(Request(url)) as response:
    page = response.read().decode()

# The first href pointing into _Sales/ is the "download as pdf" link.
content = page.split('<a href="' + PDF_BASE)[1].split('"')[0]
output = PDF_BASE + content
print(output)
I am trying to get the redirected URL that https://trade.ec.europa.eu/doclib/html/153814.htm leads to (a pdf file).
I've so far tried
# allow_redirects only follows HTTP 3xx redirects; this page redirects
# via a link inside the returned HTML instead, so r.url stays the
# original URL (the answers below extract the link from the HTML).
r = requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm', allow_redirects = True)
print(r.url)
and it outputs the same old URL. I need the redirected URL which is https://trade.ec.europa.eu/doclib/docs/2015/september/tradoc_153814.pdf
Please try this code to see if it works for you
import urllib.request
import re
import requests
import PyPDF2
import io
from requests_html import HTMLSession
from urllib.parse import urlparse
from PyPDF2 import PdfFileReader

# Get Domain Name With urlparse
url = "https://trade.ec.europa.eu/doclib/html/153814.htm"
parsed_url = urlparse(url)
domain = parsed_url.scheme + "://" + parsed_url.netloc

# Get URL
session = HTMLSession()
r = session.get(url)

# Extract Links.  '@href' is the XPath attribute axis; the original
# '//a/#href' (a markdown-mangled '@') selects nothing.
jlinks = r.html.xpath('//a/@href')

# Remove bad links and replace relative path for absolute path
updated_links = []
for link in jlinks:
    if re.search(".*#.*|.*javascript:.*|.*tel:.*", link):
        # anchors, javascript: and tel: pseudo-links -- skip entirely
        continue
    if re.search("^(?!http).*", link):
        # relative path: prefix the page's scheme+host
        link = domain + link
    updated_links.append(link)

# The first remaining link on this page is the PDF the page redirects to.
r = requests.get(updated_links[0])
f = io.BytesIO(r.content)
reader = PdfFileReader(f)
contents = reader.getPage(0).extractText()
print(contents)
I think you should extract the redirect link yourself (I didn't find any way to do this automatically with redirects): when you request https://trade.ec.europa.eu/doclib/html/153814.htm it returns an HTML page containing the redirect link, which you can extract, for example, like this
import requests
from lxml import etree, html
# Fetch the page once, parse it, and print the first anchor's href --
# that is the PDF URL the HTML page redirects to.
page = requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm').text
tree = html.fromstring(page)
print(tree.xpath('.//a/@href')[0])
Output will be
https://trade.ec.europa.eu/doclib/docs/2015/september/tradoc_153814.pdf
I've been creating a program with a variety of uses. I call it the Electronic Database with Direct Yield (EDDY). One thing that I have been having the most trouble with is EDDY's google search capabilities. EDDY will ask the user to give an input. EDDY will then edit the input slightly by replacing any spaces (' ') with plus signs ('+'), then go to the resulting url (without opening a browser). It then copies the html from the webpage and is SUPPOSED to give the results and descriptions of the site, and to specify, without the HTML code.
This is what I have so far.
import urllib
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import requests
def cleanup(url, html_content=None):
    """Parse a page and print its prettified markup, title and body text.

    If *html_content* is given it is parsed directly; otherwise the page
    is re-fetched with requests (kept for backward compatibility).
    """
    if html_content is None:
        html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    length = len(soup.prettify()) - 1
    # 16800 skips leading boilerplate markup -- fragile magic number,
    # presumably tuned for Google's result page; TODO confirm.
    print(soup.prettify()[16800:length])
    print(soup.title.text)
    print(soup.body.text)

def eddysearch():
    """Prompt for search queries in a loop until the user types "quit"."""
    headers = {'User-Agent': 'Chrome.exe'}
    reg_url = "http://www.google.com/search?q="
    # Loop instead of recursing: the original called eddysearch() from
    # within itself and would eventually hit Python's recursion limit.
    while True:
        print("Ready for query")
        query = input()
        if query == "quit":
            return
        print("Searching for keyword: " + query)
        print("Please wait...")
        search = urllib.parse.quote_plus(query)
        url = reg_url + search
        req = Request(url=url, headers=headers)
        html = urlopen(req).read()
        # Pass the page we just fetched WITH headers to cleanup().  The
        # original threw `html` away and re-fetched without the
        # User-Agent header, which is what Google blocks.
        cleanup(url, html)

eddysearch()
Can anyone help me out? Thanks in advance!
If you don't want to use an SSL certificate, you can call .read()
# Python 2.7.x
import urllib
url = "http://stackoverflow.com"
# Python 2 only: urllib.urlopen() was moved to urllib.request.urlopen()
# in Python 3.
f = urllib.urlopen(url)
# Python 2 print statement -- this line is a SyntaxError under Python 3.
print f.read()
#Python 3.x
import urllib.request

url = 'http://www.stackoverflow.com'
# Use a context manager so the connection is closed even if read()
# raises (the original never closed the response).
with urllib.request.urlopen(url) as f:
    print(f.read())
I've tried with the urllib and requests libraries, but the data in the fragment was not written to the .html file. Help me please :(
Here with the request
url = 'https://xxxxxxxxxxx.co.jp/InService/delivery/#/V=2/partsList/Element.PartsList%3A%3AVj0xfnsicklkIjoiQzEtQlVMTERPWkVSLUxfSVNfQzNfLl9CVUxMRE9aRVItTF8uXzgwXy5fRDg1RVNTLTJfLl9LSSIsIm9wIjpbIkMxLUJVTExET1pFUi1MX0lTX0MzXy5fQlVMTERPWkVSLUxfLl84MF8uX0Q4NUVTUy0yXy5fS0kiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDMiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDNfLl9BMCIsIlBMX0MxLUJVTExET1pFUi1MX0FDXy5fRDg1RVNTLTJfLl9LSS0wMDAwM18uX0EwMDEwMDEwIl0sIm5uIjoyMTQsInRzIjoxNTc5ODM0OTIwMDE5fQ?filterId=Product%3A%3AVj0xfnsicklkIjoiUk9PVCBQUk9EVUNUIiwib3AiOlsiUk9PVCBQUk9EVUNUIiwiQzEtQlVMTERPWkVSLUwiLCJDMl8uX0JVTExET1pFUi1MXy5fODAiLCJDM18uX0JVTExET1pFUi1MXy5fODBfLl9EODVFU1MtMl8uX0tJIl0sIm5uIjo2OTcsInRzIjoxNTc2NTY0MjMwMDg1fQ&bomFilterState=false'
response = requests.get(url)
# Prints only the repr, e.g. "<Response [200]>"; use response.content
# (or response.text) to see the document body -- see the answer below.
# Note: everything after '#' is a URL fragment, which is never sent to
# the server, so the server returns the base page.
print(response)
here with the urllib
url = 'https://xxxxxxx.co.jp/InService/delivery/?view=print#/V=2/partsList/Element.PartsList::Vj0xfnsicklkIjoiQzEtQlVMTERPWkVSLUxfSVNfQzNfLl9CVUxMRE9aRVItTF8uXzgwXy5fRDg1RVNTLTJfLl9LSSIsIm9wIjpbIkMxLUJVTExET1pFUi1MX0lTX0MzXy5fQlVMTERPWkVSLUxfLl84MF8uX0Q4NUVTUy0yXy5fS0kiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDMiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDNfLl9BMCIsIlBMX0MxLUJVTExET1pFUi1MX0FDXy5fRDg1RVNTLTJfLl9LSS0wMDAwM18uX0EwMDEwMDIwIl0sIm5uIjoyMjUsInRzIjoxNTgwMDk1MDYzNjIyfQ?filterId=Product::Vj0xfnsicklkIjoiUk9PVCBQUk9EVUNUIiwib3AiOlsiUk9PVCBQUk9EVUNUIiwiQzEtQlVMTERPWkVSLUwiLCJDMl8uX0JVTExET1pFUi1MXy5fODAiLCJDM18uX0JVTExET1pFUi1MXy5fODBfLl9EODVFU1MtMl8uX0tJIl0sIm5uIjo2OTcsInRzIjoxNTc2NTY0MjMwMDg1fQ&bomFilterState=false'
# NOTE(review): this snippet assumes `urllib.request` and `base64` are
# imported elsewhere -- they are not imported in this excerpt.
request = urllib.request.Request(url)
# Build an HTTP Basic Auth header from "user:password" credentials.
string = '%s:%s' % ('xx','xx')
base64string = base64.standard_b64encode(string.encode('utf-8'))
request.add_header("Authorization", "Basic %s" % base64string.decode('utf-8'))
u = urllib.request.urlopen(request)
webContent = u.read()
here is home of the web page (url:https://xxxxxx.co.jp/InService/delivery/#/V=2/home)
and here is the page that i want to get the data (url: https://xxxxxxx.co.jp/InService/delivery/?view=print#/V=2/partsList/Element.PartsList::Vj0xfnsicklkIjoiQzE...)
So whenever I request the web page as in the second picture, the HTML content returned is the HTML from picture 1, because the part shown in picture 2 comes from the URL fragment, which is never sent to the server.
If all you would like is the html of the webpage, just use requests as you have in the first example, except instead of print(response) use print(response.content).
To save it into a file use:
import requests

url = 'https://xxxxxxx.co.jp/InService/delivery/?view=print#/V=2/partsList/Element.PartsList::Vj0xfnsicklkIjoiQzEtQlVMTERPWkVSLUxfSVNfQzNfLl9CVUxMRE9aRVItTF8uXzgwXy5fRDg1RVNTLTJfLl9LSSIsIm9wIjpbIkMxLUJVTExET1pFUi1MX0lTX0MzXy5fQlVMTERPWkVSLUxfLl84MF8uX0Q4NUVTUy0yXy5fS0kiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDMiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDNfLl9BMCIsIlBMX0MxLUJVTExET1pFUi1MX0FDXy5fRDg1RVNTLTJfLl9LSS0wMDAwM18uX0EwMDEwMDIwIl0sIm5uIjoyMjUsInRzIjoxNTgwMDk1MDYzNjIyfQ?filterId=Product::Vj0xfnsicklkIjoiUk9PVCBQUk9EVUNUIiwib3AiOlsiUk9PVCBQUk9EVUNUIiwiQzEtQlVMTERPWkVSLUwiLCJDMl8uX0JVTExET1pFUi1MXy5fODAiLCJDM18uX0JVTExET1pFUi1MXy5fODBfLl9EODVFU1MtMl8uX0tJIl0sIm5uIjo2OTcsInRzIjoxNTc2NTY0MjMwMDg1fQ&bomFilterState=false'
response = requests.get(url)
# response.content is *bytes*, so the file must be opened in binary
# mode: with the original text mode 'w+', f.write(response.content)
# raises TypeError.
with open("output.html", 'wb') as f:
    f.write(response.content)
If you need a certain part of the webpage, use BeautifulSoup.
import requests
from bs4 import BeautifulSoup

# Download the printable parts-list view and parse it into a soup.
url = 'https://xxxxxxx.co.jp/InService/delivery/?view=print#/V=2/partsList/Element.PartsList::Vj0xfnsicklkIjoiQzEtQlVMTERPWkVSLUxfSVNfQzNfLl9CVUxMRE9aRVItTF8uXzgwXy5fRDg1RVNTLTJfLl9LSSIsIm9wIjpbIkMxLUJVTExET1pFUi1MX0lTX0MzXy5fQlVMTERPWkVSLUxfLl84MF8uX0Q4NUVTUy0yXy5fS0kiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDMiLCJJU19QQl8uX0Q4NUVTUy0yXy5fS0ktMDAwMDNfLl9BMCIsIlBMX0MxLUJVTExET1pFUi1MX0FDXy5fRDg1RVNTLTJfLl9LSS0wMDAwM18uX0EwMDEwMDIwIl0sIm5uIjoyMjUsInRzIjoxNTgwMDk1MDYzNjIyfQ?filterId=Product::Vj0xfnsicklkIjoiUk9PVCBQUk9EVUNUIiwib3AiOlsiUk9PVCBQUk9EVUNUIiwiQzEtQlVMTERPWkVSLUwiLCJDMl8uX0JVTExET1pFUi1MXy5fODAiLCJDM18uX0JVTExET1pFUi1MXy5fODBfLl9EODVFU1MtMl8uX0tJIl0sIm5uIjo2OTcsInRzIjoxNTc2NTY0MjMwMDg1fQ&bomFilterState=false'
page_bytes = requests.get(url).content
# (No parser argument given; bs4 picks the best one installed.)
response = BeautifulSoup(page_bytes)
use inspect element and find the Tag of the table that you want in the second image, eg. https://imgur.com/a/pGbCCFy.
then use:
found = response.find('div', attrs={"class":"x-carousel__body no-scroll"}).find_all('ul')
For the ebay example I linked above.
This should return that table which you can then do whatever you like with.
I am trying to extract all the images from the URL below. However, I don't understand the HTTP Error 403: Forbidden — can it be taken care of during error handling, or can the URL simply not be scraped due to limitations?
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib.request
def make_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    NOTE(review): no User-Agent header is sent, which is why some sites
    (like the one in the question) answer HTTP 403 -- see the answer
    below that adds headers.
    """
    html = urlopen(url).read()
    return BeautifulSoup(html)
def get_images(url):
    """Download every image found on *url* into the current directory.

    Returns the list of (absolute) image URLs that were downloaded.
    """
    from urllib.parse import urljoin  # resolve relative src values

    soup = make_soup(url)
    #this makes a list of bs4 element tags
    images = [img for img in soup.findAll('img')]
    print (str(len(images)) + "images found.")
    print("downloading to current directory ")
    #compile our unicode list of image links
    image_links = []
    for each in images:
        src = each.get('src')
        if not src:
            # <img> with no src attribute would crash urlretrieve
            # below (the original passed None) -- skip it.
            continue
        # Relative paths like /img/logo.png need the page URL prepended
        # or urlretrieve fails with "unknown url type".
        image_links.append(urljoin(url, src))
    for each in image_links:
        filename = each.split('/')[-1]
        urllib.request.urlretrieve(each, filename)
    return image_links

get_images("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")
some sites need you to specify User-Agent header
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request
def make_soup(url):
    """Fetch *url* with a browser-like User-Agent and return its soup."""
    # Sending a User-Agent avoids the HTTP 403 some sites return to the
    # default Python urllib client.
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urlopen(request)
    return BeautifulSoup(response)
You can use this function for image scraping. Relying on the img tag alone is not very useful nowadays; we can implement something like the function below, which fulfills the requirement. It does not rely on any particular tag, so wherever an image link is present it will grab it.
def extract_ImageUrl(soup_chunk):
    """Collect every .jpg/.png URL found in any attribute of any tag.

    Scans all tags (not just <img>), so image links in arbitrary
    attributes are found too.  Returns the list of URLs found.

    soup_chunk: object with a no-argument find_all() returning tags
    that expose an `attrs` dict (e.g. a BeautifulSoup tree).
    """
    # Group the alternation: the original pattern 'http.*\.jpg|png'
    # parsed as '(http.*\.jpg)|(png)', so it matched anything starting
    # with "png" and never matched an http...png URL at all.
    pattern = re.compile(r'http.*\.(jpg|png)')
    urls_found = []
    for tags in soup_chunk.find_all():
        attributes = tags.attrs
        if 'http' not in str(attributes):
            continue  # cheap pre-filter before scanning each value
        for links in attributes.values():
            if pattern.match(str(links)):
                if len(str(links).split()) <= 1:
                    urls_found.append(links)
                else:
                    # attribute holds several whitespace-separated URLs
                    # (e.g. srcset) -- keep only the matching ones
                    urls_found = urls_found + [
                        i.strip() for i in str(links).split()
                        if pattern.match(str(i))
                    ]
    print("Found {} image links".format(len(urls_found)))
    return urls_found
This is an initial implementation; it will require updates to make it better.