I am trying to get the redirected URL that https://trade.ec.europa.eu/doclib/html/153814.htm leads to (a pdf file).
I've so far tried
r = requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm', allow_redirects = True)
print(r.url)
and it outputs the same old URL. I need the redirected URL, which is https://trade.ec.europa.eu/doclib/docs/2015/september/tradoc_153814.pdf
Please try this code to see if it works for you:
import io
import re
import requests
from requests_html import HTMLSession
from urllib.parse import urlparse
from PyPDF2 import PdfReader  # PdfFileReader/getPage/extractText are deprecated in PyPDF2 3.x

# Get the domain name with urlparse
url = "https://trade.ec.europa.eu/doclib/html/153814.htm"
parsed_url = urlparse(url)
domain = parsed_url.scheme + "://" + parsed_url.netloc

# Get the page
session = HTMLSession()
r = session.get(url)

# Extract links (note: the XPath attribute selector is @href, not #href)
jlinks = r.html.xpath('//a/@href')

# Remove bad links and replace relative paths with absolute URLs
updated_links = []
for link in jlinks:
    if re.search(r".*#.*|.*javascript:.*|.*tel:.*", link):
        continue  # skip anchors, javascript: and tel: links
    elif re.search(r"^(?!http).*", link):
        updated_links.append(domain + link)
    else:
        updated_links.append(link)

r = requests.get(updated_links[0])
f = io.BytesIO(r.content)
reader = PdfReader(f)
contents = reader.pages[0].extract_text()
print(contents)
I think you should extract the redirect link yourself (I didn't find any way to do this with requests' redirect handling, because the page performs the redirect in HTML rather than with an HTTP 3xx response). When you request https://trade.ec.europa.eu/doclib/html/153814.htm you get an HTML page containing the redirect link, and you can extract it like this:
import requests
from lxml import html

tree = html.fromstring(requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm').text)
print(tree.xpath('.//a/@href')[0])
Output will be
https://trade.ec.europa.eu/doclib/docs/2015/september/tradoc_153814.pdf
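As for why allow_redirects=True doesn't help here: requests only follows HTTP-level (3xx) redirects, and this page performs its redirect inside the HTML instead, so there is no redirect chain for requests to follow. A quick check to confirm this:
import requests

r = requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm', allow_redirects=True)
print(r.status_code)  # 200: the page answers directly
print(r.history)      # expected to be empty: no HTTP redirect happened, which is why r.url never changes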
I am trying to download pdfs from several pdf urls.
An example: https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf
This url directly opens into the PDF on my browser.
However, when I use this code to download it via the link, it returns an HTML file instead.
link = "https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf"
urllib.request.urlretrieve(link, f"/content/drive/MyDrive/Research/pdfs/1.pdf")
The resulting "pdf" file or HTML code file is downloaded instead:
How do I solve this issue? Appreciate any help, thanks!
You can use BeautifulSoup or lxml to find the <iframe> and get its src, and use that to download the file:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as BS
url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'
response = urllib.request.urlopen(url)
soup = BS(response.read(), 'html.parser')
iframe = soup.find('iframe')
url = iframe['src']
filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename)
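If the iframe src ever turns out to be relative rather than absolute, resolve it against the page URL before downloading; a small addition using the modules already imported above:
# instead of url = iframe['src'], resolve a possibly relative src:
url = urllib.parse.urljoin(response.geturl(), iframe['src'])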
Alternatively, you can check a few files to see if they all use the same https://d2x0djib3vzbzj.cloudfront.net/ prefix and simply substitute it into the URL:
import urllib.request
import urllib.parse
url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'
url = url.replace('https://www.fasb.org/page/showpdf?path=',
'https://d2x0djib3vzbzj.cloudfront.net/')
filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename)
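Either way, it is worth checking that what you saved really is a PDF and not an error page; PDF files start with the magic bytes %PDF:
with open(filename, 'rb') as f:
    if f.read(4) != b'%PDF':
        print('warning:', filename, 'does not look like a PDF')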
I didn't know how to put the title, so it's rather long. Feel free to edit it.
I am trying to scrape data from this site, but I can't figure out how to access the individual keys and values inside window.data with BeautifulSoup.
I'd like, for example, to get the values of yyuid, birthday, etc.
The code is as such:
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import re
username = "itsahardday"
url = "https://likee.video/#" + username # profile url - https://likee.video/account_name
def get_profile_html():
    '''
    Get profile data from HTML - https://likee.video/account_name
    :return:
    '''
    response = urllib.request.urlopen(url)
    soup = BeautifulSoup(response.read(), "html.parser")
    results = soup.select_one("script:-soup-contains('userinfo')").string
    print(results)

get_profile_html()
Preferably, I would like to have it as JSON, but any solution is welcome.
Thank you in advance for your help!
I tweaked your code so the function returns the result:
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import re
username = "itsahardday"
url = "https://likee.video/#" + username # profile url - https://likee.video/account_name
def get_profile_html():
    '''
    Get profile data from HTML - https://likee.video/account_name
    :return:
    '''
    response = urllib.request.urlopen(url)
    soup = BeautifulSoup(response.read(), "html.parser")
    results = soup.select_one("script:-soup-contains('userinfo')").string
    print(results)
    return results  # add return

res = get_profile_html()  # save the result
Then convert it to JSON:
import json # import
json.loads(res.split(";")[0].split("window.data =")[1])['userinfo']
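If the inline script's layout ever changes (extra whitespace, more statements before the assignment), a regex is a little more forgiving than splitting on ";". A sketch, under the same assumption as above that the window.data object literal ends with "};":
import json
import re

match = re.search(r'window\.data\s*=\s*(\{.*?\})\s*;', res, re.S)
if match:
    print(json.loads(match.group(1))['userinfo'])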
Hello community, I have a problem and I don't know how to solve it. I wrote a script to crawl webpages for images with BeautifulSoup4, but I get the error AttributeError: 'NoneType' object has no attribute 'group'.
import re
import requests
from bs4 import BeautifulSoup
site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img', {"src": True})
urls = [img["src"] for img in img_tags]
for url in urls:
    filename = re.search(r'([\w_-]+[.](jpg|png))$', url)
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
Your regex doesn't match some of the image URLs on that page, so re.search returns None and .group(1) fails. If you're not familiar with regexes, let Python's built-in urllib do the heavy lifting instead of writing them yourself.
Use something like this (untested):
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlsplit # import this additional library
from os.path import basename # import this additional library
site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
images_div = soup.find(id=re.compile(r"fcx-gallery-\w+"))  # focus on the div containing the images
if images_div:  # make sure the gallery div was found
    img_tags = images_div.find_all('img', {"data-src": True})  # get all the images in that div
    urls = [img["data-src"] for img in img_tags]  # grab sources from data-src
    for url in urls:
        filename = basename(urlsplit(url).path)  # use this instead of a regex
        with open(filename, 'wb') as f:  # filename is now a string
            if 'http' not in url:
                # sometimes an image source can be relative;
                # if it is, prepend the base url, which also happens
                # to be the site variable atm.
                url = '{}{}'.format(site, url)
            response = requests.get(url)
            f.write(response.content)
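One caveat this version inherits from the original: '{}{}'.format(site, url) builds a broken URL for relative sources, because site ends with ?sort=new. urllib.parse.urljoin handles both absolute and relative sources, so the whole 'http' check can be replaced; a sketch:
from urllib.parse import urljoin

# inside the loop, instead of the 'http' check and string formatting:
url = urljoin(site, url)  # leaves absolute URLs untouched, resolves relative ones against the page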
I am trying to get the full content of another website (or to modify the links that people click when they use other websites through my site) in Django.
import requests
from django.http import HttpResponse  # needed for the return value

def one(request, myurl='google.com'):
    url = 'http://' + myurl
    r = requests.get(url)
    return HttpResponse(r)
The result of requests.get(…) is a Response object, not a string. You can obtain the raw body with its content attribute. For example:
import requests
from django.http import HttpResponse

def one(request, myurl='google.com'):
    url = 'http://' + myurl
    r = requests.get(url)
    return HttpResponse(
        content=r.content,
        content_type=r.headers.get('Content-Type'),
        status=r.status_code,
    )
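For large pages or files it may be better to stream the body instead of buffering it all in memory; a sketch using Django's StreamingHttpResponse (untested, same view signature as above):
import requests
from django.http import StreamingHttpResponse

def one(request, myurl='google.com'):
    url = 'http://' + myurl
    r = requests.get(url, stream=True)  # defer downloading the body
    return StreamingHttpResponse(
        streaming_content=r.iter_content(chunk_size=8192),
        content_type=r.headers.get('Content-Type'),
        status=r.status_code,
    )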
I am building a crawler in Python and I have a list of hrefs from the page.
Now I have a list of file extensions to download, like
extensions = ['zip', 'rar', 'pdf', 'mp3']  # renamed from "list" to avoid shadowing the built-in
How can I save the files from those URLs to a local directory using Python?
EDIT:
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.example.com/downlaod"
site = urllib.request.urlopen(url)
html = site.read()
soup = BeautifulSoup(html, 'html.parser')  # specify a parser explicitly
list_urls = soup.find_all('a')
print(list_urls[6])
Going by your posted example:
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.example.com/downlaod"
site = urllib.request.urlopen(url)
html = site.read()
soup = BeautifulSoup(html, 'html.parser')
list_urls = soup.find_all('a')
print(list_urls[6])
So, the URL you want to fetch next is presumably list_urls[6]['href'].
The first trick is that this might be a relative URL rather than absolute. So:
import urllib.parse

newurl = list_urls[6]['href']
absurl = urllib.parse.urljoin(site.url, newurl)
Also, you only want to fetch the file if it has the right extension, so:
extensions = ('.zip', '.rar', '.pdf', '.mp3')  # str.endswith accepts a tuple of suffixes
if not absurl.endswith(extensions):
    return  # or break or whatever
But once you've decided what URL you want to download, it's no harder than your initial fetch:
import os

page = urllib.request.urlopen(absurl)
data = page.read()  # binary file content, not necessarily HTML
path = urllib.parse.urlparse(absurl).path
name = os.path.basename(path)
with open(name, 'wb') as f:
    f.write(data)
That's mostly it.
There are a few things you might want to add, but if so, you have to add them all manually (see the sketch after this list). For example:
Look for a Content-Disposition header with a suggested filename to use in place of the URL's basename.
Copy from page to f with shutil.copyfileobj instead of reading the whole thing into memory and then writing it out.
Deal with existing files with the same name.
…
But that's the basics.
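A rough sketch of those three additions combined (untested, and the Content-Disposition parsing is deliberately naive):
import os
import shutil
import urllib.parse
import urllib.request

def save(absurl):
    page = urllib.request.urlopen(absurl)
    # 1. prefer a filename suggested by Content-Disposition, if present
    cd = page.headers.get('Content-Disposition', '')
    if 'filename=' in cd:
        name = cd.split('filename=')[1].strip('" ;')
    else:
        name = os.path.basename(urllib.parse.urlparse(absurl).path)
    # 2. don't clobber an existing file with the same name
    base, ext = os.path.splitext(name)
    counter = 1
    while os.path.exists(name):
        name = '{}-{}{}'.format(base, counter, ext)
        counter += 1
    # 3. stream to disk instead of holding the whole body in memory
    with open(name, 'wb') as f:
        shutil.copyfileobj(page, f)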
You can use the Python requests library, as you asked in the question: http://www.python-requests.org
You can save a file from a URL like this:
import requests
url = 'http://i.stack.imgur.com/0LJdh.jpg'
data = requests.get(url).content
filename = "image.jpg"
with open(filename, 'wb') as f:
    f.write(data)
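To tie this back to the extension list from the question, a sketch (assuming the url and list_urls variables from the EDIT snippet above):
import os
import requests
from urllib.parse import urljoin, urlparse

extensions = ('.zip', '.rar', '.pdf', '.mp3')

for a in list_urls:
    href = urljoin(url, a.get('href', ''))
    if href.lower().endswith(extensions):
        name = os.path.basename(urlparse(href).path)
        with open(name, 'wb') as f:
            f.write(requests.get(href).content)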
A solution using urllib3:
import os
import urllib3
from bs4 import BeautifulSoup
import urllib.parse
url = "https://path/site"
site = urllib3.PoolManager()
html = site.request('GET', url)
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('a')
and then a recursive function to get all the files:
extensions = ('.zip', '.rar', '.pdf', '.mp3')  # str.endswith accepts a tuple of suffixes

def recursive_function(list_urls):
    if not list_urls:  # base case: stop when the list is empty
        return
    newurl = list_urls[0]['href']
    absurl = urllib.parse.urljoin(url, newurl)  # handles absolute and relative hrefs
    list_urls.pop(0)
    if absurl.endswith(extensions):  # only download the targeted extensions
        html = site.request('GET', absurl)  # reuse the PoolManager created above
        name = os.path.basename(absurl)
        with open(name, 'wb') as f:
            f.write(html.data)
    return recursive_function(list_urls)
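One caveat: Python's default recursion limit is about 1000 frames, so for pages with many links a plain loop is safer; an equivalent iterative sketch using the same variables:
def download_all(list_urls):
    for a in list_urls:
        absurl = urllib.parse.urljoin(url, a['href'])
        if absurl.endswith(extensions):
            resp = site.request('GET', absurl)  # reuse the PoolManager
            with open(os.path.basename(absurl), 'wb') as f:
                f.write(resp.data)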