I have an online folder containing video files and I need to download the videos from there to my local system. To do that I am using the bs4 and urllib modules, but the downloaded files are not playable.
Please have a look at my code.
from bs4 import BeautifulSoup
import urllib2
import random
import urllib
from urllib2 import urlopen
def download(url):
    response = urllib.urlopen("http://ssdolutions/addadsfasdfulsdfaatadfae")
    doc = response.read()
    soup = BeautifulSoup(doc)
    for link in soup.find_all('a'):
        x = link.get('href')
        name = random.randrange(1, 10)
        full_name = str(name) + ".mp4"
        urllib.urlretrieve(url, full_name)
download("http://ssdolutions/addadsfasdfulsdfaatadfae")
Pass each link URL (the value stored in x) to the urlretrieve call instead of the page URL:
from bs4 import BeautifulSoup
import urllib
import os
def download(url):
    response = urllib.urlopen(url)
    doc = response.read()
    soup = BeautifulSoup(doc, "html.parser")
    for link in soup.find_all('a'):
        link_url = link.get('href')
        # Extract the filename from the link URL
        filename = os.path.basename(link_url)
        file_data = os.path.splitext(filename)
        if len(file_data) > 1:
            file_ext = file_data[1]
            # this will download only links with an .mp4 extension
            if file_ext == ".mp4":
                urllib.urlretrieve(link_url, filename)
download("http://ssdolutions/addadsfasdfulsdfaatadfae")
I need help with one piece of code. I want to download the PDFs from all of the pages with a "/#documentu" section, not only one, but I don't want to write all of those links into the code; the parsing must be automatic. Here's the code:
from bs4 import BeautifulSoup
import os
import requests
from urllib.parse import urljoin
# Need to download the files from every page that has a "/#documentu" section,
# without writing all the links into the code; it must be automatic.
urlpage = "https://fasie.ru/programs/programma-innoshkolnik/#documentu"
# If the folder does not exist, the script will create it automatically
folder_location = r'C:\Download'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)
response = requests.get(urlpage)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    # Name the pdf files using the last portion of each link, which is unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(urlpage, link['href'])).content)
path = r'C:\Download'
i = 1
for file_name in os.listdir(path):
    base_name, ext = os.path.splitext(file_name)
    abs_file_name = os.path.join(path, file_name)
    new_abs_file_name = os.path.join(path, str(i) + ext)
    os.rename(abs_file_name, new_abs_file_name)
    i += 1
I need help making the parsing automatic across all of the pages.
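One possible way to make the crawl automatic is to collect the program pages from a listing page first and then reuse the same PDF-saving loop on each of them. A minimal sketch, assuming https://fasie.ru/programs/ is the site's index of program pages and that program pages are linked under /programs/ (both are assumptions about the site's structure, not verified values):
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

folder_location = r'C:\Download'
os.makedirs(folder_location, exist_ok=True)

listing_url = "https://fasie.ru/programs/"  # assumed index of all program pages
listing = BeautifulSoup(requests.get(listing_url).text, "html.parser")

# Assumption: program pages are linked under /programs/<slug>/; tighten the filter as needed
program_pages = {urljoin(listing_url, a['href'])
                 for a in listing.select("a[href*='/programs/']")}

for page in program_pages:
    soup = BeautifulSoup(requests.get(page).text, "html.parser")
    for link in soup.select("a[href$='.pdf']"):
        filename = os.path.join(folder_location, link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(page, link['href'])).content)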
I am trying to download pdfs from several pdf urls.
An example: https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf
This url directly opens into the PDF on my browser.
However, when I use the code below to download it via that link, an HTML file is returned instead.
link = "https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf"
urllib.request.urlretrieve(link, f"/content/drive/MyDrive/Research/pdfs/1.pdf")
The resulting "pdf" file or HTML code file is downloaded instead:
How do I solve this issue? Appreciate any help, thanks!
You can use BeautifulSoup or lxml to find the <iframe> and get its src, then use that URL to download the file:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as BS
url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'
response = urllib.request.urlopen(url)
soup = BS(response.read(), 'html.parser')
iframe = soup.find('iframe')
url = iframe['src']
filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename)
Alternatively, you can check a few files to see whether they all use the same https://d2x0djib3vzbzj.cloudfront.net/ host and simply substitute it into the URL:
import urllib.request
import urllib.parse
url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'
url = url.replace('https://www.fasb.org/page/showpdf?path=',
'https://d2x0djib3vzbzj.cloudfront.net/')
filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename)
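Whichever variant you use, it can be worth verifying that the saved file really is a PDF rather than another HTML page. A small sketch, reusing the filename variable from the snippet above:
# A real PDF starts with the magic bytes "%PDF"
with open(filename, 'rb') as f:
    if f.read(4) != b'%PDF':
        print(filename, "does not look like a PDF - probably an HTML page")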
I am trying to get the redirected URL that https://trade.ec.europa.eu/doclib/html/153814.htm leads to (a pdf file).
I've so far tried
r = requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm', allow_redirects = True)
print(r.url)
but it outputs the same original URL. I need the redirect target, which is https://trade.ec.europa.eu/doclib/docs/2015/september/tradoc_153814.pdf
Please try this code to see if it works for you:
import urllib.request
import re
import requests
import PyPDF2
import io
from requests_html import HTMLSession
from urllib.parse import urlparse
from PyPDF2 import PdfFileReader
# Get Domain Name With urlparse
url = "https://trade.ec.europa.eu/doclib/html/153814.htm"
parsed_url = urlparse(url)
domain = parsed_url.scheme + "://" + parsed_url.netloc
# Get URL
session = HTMLSession()
r = session.get(url)
# Extract Links
jlinks = r.html.xpath('//a/@href')
# Remove bad links and replace relative paths with absolute paths
updated_links = []
for link in jlinks:
    if re.search(".*#.*|.*javascript:.*|.*tel:.*", link):
        link = ""
    elif re.search("^(?!http).*", link):
        link = domain + link
        updated_links.append(link)
    else:
        updated_links.append(link)
r = requests.get(updated_links[0])
f = io.BytesIO(r.content)
reader = PdfFileReader(f)
contents = reader.getPage(0).extractText()
print(contents)
I think you have to extract the redirect link yourself (I didn't find a way to follow it with requests' redirect handling): requesting https://trade.ec.europa.eu/doclib/html/153814.htm returns an HTML page that contains the redirect link, and you can extract it, for example, like this:
import requests
from lxml import etree, html
tree = html.fromstring(requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm').text)
print(tree.xpath('.//a/@href')[0])
The output will be:
https://trade.ec.europa.eu/doclib/docs/2015/september/tradoc_153814.pdf
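Once the real PDF URL has been extracted, the download itself is straightforward; a short sketch combining the extraction above with saving the file under its original name:
import requests
from lxml import html

page = requests.get('https://trade.ec.europa.eu/doclib/html/153814.htm')
pdf_url = html.fromstring(page.text).xpath('.//a/@href')[0]

# Save the PDF to disk under its original name
with open(pdf_url.rsplit('/', 1)[-1], 'wb') as f:
    f.write(requests.get(pdf_url).content)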
I'm unable to save/download the images to the target location, and I can't figure out the problem although the code seems right.
I'm using the requests library for scraping the images.
import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
import re
from lxml.html import fromstring
r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
title = fromstring(r.content).findtext('.//title')
#print(title)
newPath = r'C:\Users\Vicky\Desktop\ScrappedImages\ ' + title
for link in soup.find_all('img'):
    image = link.get('src')
    if 'http' in image:
        print(image)
        imageName = os.path.split(image)[1]
        print(imageName)
        r2 = requests.get(image)
        if not os.path.exists(newPath):
            os.makedirs(newPath)
        with open(imageName, "wb") as f:
            f.write(r2.content)
Try wrapping your r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci") in a try: block or a retry loop to make sure that the website you are scraping is returning a 200 response; it could be that the website is timing out or not serving your request. (A status-check sketch follows the revised code below.)
import os
from bs4 import BeautifulSoup
import urllib.request
import requests
from urllib.parse import urlparse

r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")

for link in soup.find_all('img'):
    image = link.get('src')
    if bool(urlparse(image).netloc):
        print(image)
        imageName = image[image.rfind("/") + 1:]
        print(imageName)
        urllib.request.urlretrieve(image, imageName)
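A minimal sketch of the status check suggested above, assuming you only want to parse the page when the request actually succeeds:
import requests

try:
    r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci", timeout=10)
    r.raise_for_status()  # raises for 4xx/5xx responses
except requests.RequestException as e:
    print("Request failed:", e)
else:
    data = r.text  # safe to parse from here on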
I'm trying to write a Python script to download images from any website. It is working, but inconsistently. Specifically, find_all("img") is not doing so for the second url. The script is:
# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)
# open page, get HTML request and parse with BeautifulSoup
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
if img["src"].endswith("jpg"):
print("endswith jpg")
urls.append(str(img["src"]))
print(str(img))
jpeg_no = 00
for url in urls:
url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
jpeg_no += 1
The images on the failing page are rendered with JavaScript, so first render the page with dryscrape (if you don't want to use dryscrape, see Web-scraping JavaScript page with Python), e.g.:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)
# open page, get HTML request and parse with BeautifulSoup
session = dryscrape.Session()
session.visit("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
response = session.body()
soup = BeautifulSoup(response, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
if img["src"].endswith("jpg"):
print("endswith jpg")
urls.append(str(img["src"]))
print(str(img))
jpeg_no = 00
for url in urls:
url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
jpeg_no += 1
But I would also check that you have an absolute URL, not a relative one:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin
def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)
# open page, get HTML request and parse with BeautifulSoup
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
if img["src"].endswith("jpg"):
print("endswith jpg")
urls.append(str(img["src"]))
print(str(img))
jpeg_no = 00
for url in urls:
if url.startswith( 'http' ):
absoute = url
else:
absoute = urljoin(base, url)
print (absoute)
url_to_image(absoute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
jpeg_no += 1
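One practical note on both versions: Image.save fails if the NatGeoPix folder does not already exist, so it is worth creating it up front; a one-line guard, assuming the same relative output path:
import os

os.makedirs("NatGeoPix", exist_ok=True)  # create the output folder if it is missing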