Download Images to Local Folder from Urls in CSV file in Python - python

I made the program to get the Images from the Urls in the CSV file and want to download it in the Local Folder in Python but the program showing the below error
"TypeError: cannot use a string pattern on a bytes-like object"
Please check the Code in below
import pandas as pd
import urllib.request
def url_to_jpg(i, url , File_Path):
filename = 'image_{}.jpg'.format(i)
full_path = '{}{}'.format(File_Path, filename)
urllib.request.urlretrieve(url, full_path)
print('{} saved.'.format(filename))
return None
FileName = "C:/Users/IT City/Desktop/Kwiat-USA/KavantaCSV.csv"
File_Path = "C:/Users/IT City/Desktop/Kwiat-USA/images"
urls = pd.read_csv(FileName)
for i , url in enumerate(urls.values):
url_to_jpg(i, url , File_Path)
Need your Immediate help. Help will be highly Appreciated.
Thank You

Maybe you have to decode it?
-or you can simply use the requests library.
r = requests.get(url)
f = open(filename, "wb+")
f.write(r.content)
f.close()

The url being passed to urlretrieve is an array. the urls.values gives a 2d-array.
it always helps if the stack trace is also posted.

Related

Python - Scraping a PDF file from a URL

I want to scrape pdf files from this site
https://www.sigmaths.net/Reader.php?var=manuels/ph/physique_pilote_7b.pdf
I tried this code for that but it doesn't work. Can anybody tell me why, please?
res = requests.get('https://www.sigmaths.net/Reader.php?var=manuels/ph/physique_7b.pdf')
with open('C:\\Users\\sioud\\Desktop\\Manuels scolaires TN\\1\\test.pdf', 'wb') as f:
f.write(ress.content)
res = requests.get('https://www.sigmaths.net/manuels/ph/physique_7b.pdf',stream=True)
with open('test.pdf', 'wb') as f:
f.write(res.content)
your url is pointing to a reader https://www.sigmaths.net/Reader.php?var=manuels/ph/physique_7b.pdf, remove the 'reader.php?var= for the actual pdf
You can also use urlretrieve.
Check out my solution code.
from urllib.request import urlretrieve
pdfurl = u"https://www.sigmaths.net/manuels/ph/physique_7b.pdf";
urlretrieve(pdfurl, "test.pdf")
And you will find the required pdf download under the name test.pdf

Downloading multiple files with requests in Python

Currently im facing following problem:
I have 3 download links in a list. Only the last file in the list is downloaded completely.
The others have a file size of one kilobyte.
Code:
from requests import get
def download(url, filename):
with open(filename, "wb") as file:
response = get(url, stream=True)
file.write(response.content)
for link in f:
url = link
split_url = url.split("/")
filename = split_url[-1]
filename = filename.replace("\n", "")
download(url,filename)
The result looks like this:
Result
How do I make sure that all files are downloaded correctly?
All links are direct download links.
Thanks in advance!
EDIT:
I discovered it only happens when I read the links from the .txt
If I create the list in python like this:
links = ["http://ipv4.download.thinkbroadband.com/20MB.zip",
"http://ipv4.download.thinkbroadband.com/10MB.zip",
"http://ipv4.download.thinkbroadband.com/5MB.zip"]
... the problem doesnt appear.
reproduceable example:
from requests import get
def download(url, filename):
with open(filename, "wb") as file:
response = get(url, stream = True)
file.write(response.content)
f = open('links.txt','r')
for link in f:
url = link
split_url = url.split("/")
filename = split_url[-1]
filename = filename.replace("\n", "")
download(url,filename)
content of links.txt:
http://ipv4.download.thinkbroadband.com/20MB.zip
http://ipv4.download.thinkbroadband.com/10MB.zip
http://ipv4.download.thinkbroadband.com/5MB.zip
url = url.replace("\n", "")
solved it!

Download file from URL and save it in a folder Python

I've a lot of URL with file types .docx and .pdf I want to run a python script that downloads them from the URL and saves it in a folder. Here is what I've done for a single file I'll add them to a for loop:
response = requests.get('http://wbesite.com/Motivation-Letter.docx')
with open("my_file.docx", 'wb') as f:
f.write(response.content)
but the my_file.docx that it is saving is only 266 bytes and is corrupt but the URL is fine.
UPDATE:
Added this code and it works but I want to save it in a new folder.
import os
import shutil
import requests
def download_file(url, folder_name):
local_filename = url.split('/')[-1]
path = os.path.join("/{}/{}".format(folder_name, local_filename))
with requests.get(url, stream=True) as r:
with open(path, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename
Try using stream option:
import os
import requests
def download(url: str, dest_folder: str):
if not os.path.exists(dest_folder):
os.makedirs(dest_folder) # create folder if it does not exist
filename = url.split('/')[-1].replace(" ", "_") # be careful with file names
file_path = os.path.join(dest_folder, filename)
r = requests.get(url, stream=True)
if r.ok:
print("saving to", os.path.abspath(file_path))
with open(file_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024 * 8):
if chunk:
f.write(chunk)
f.flush()
os.fsync(f.fileno())
else: # HTTP status code 4XX/5XX
print("Download failed: status code {}\n{}".format(r.status_code, r.text))
download("http://website.com/Motivation-Letter.docx", dest_folder="mydir")
Note that mydir in example above is the name of folder in current working directory. If mydir does not exist script will create it in current working directory and save file in it. Your user must have permissions to create directories and files in current working directory.
You can pass an absolute file path in dest_folder, but check permissions first.
P.S.: avoid asking multiple questions in one post
try:
import urllib.request
urllib.request.urlretrieve(url, filename)

Download and save PDF file with Python requests module

I am trying to download a PDF file from a website and save it to disk. My attempts either fail with encoding errors or result in blank PDFs.
In [1]: import requests
In [2]: url = 'http://www.hrecos.org//images/Data/forweb/HRTVBSH.Metadata.pdf'
In [3]: response = requests.get(url)
In [4]: with open('/tmp/metadata.pdf', 'wb') as f:
...: f.write(response.text)
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-4-4be915a4f032> in <module>()
1 with open('/tmp/metadata.pdf', 'wb') as f:
----> 2 f.write(response.text)
3
UnicodeEncodeError: 'ascii' codec can't encode characters in position 11-14: ordinal not in range(128)
In [5]: import codecs
In [6]: with codecs.open('/tmp/metadata.pdf', 'wb', encoding='utf8') as f:
...: f.write(response.text)
...:
I know it is a codec problem of some kind but I can't seem to get it to work.
You should use response.content in this case:
with open('/tmp/metadata.pdf', 'wb') as f:
f.write(response.content)
From the document:
You can also access the response body as bytes, for non-text requests:
>>> r.content
b'[{"repository":{"open_issues":0,"url":"https://github.com/...
So that means: response.text return the output as a string object, use it when you're downloading a text file. Such as HTML file, etc.
And response.content return the output as bytes object, use it when you're downloading a binary file. Such as PDF file, audio file, image, etc.
You can also use response.raw instead. However, use it when the file which you're about to download is large. Below is a basic example which you can also find in the document:
import requests
url = 'http://www.hrecos.org//images/Data/forweb/HRTVBSH.Metadata.pdf'
r = requests.get(url, stream=True)
with open('/tmp/metadata.pdf', 'wb') as fd:
for chunk in r.iter_content(chunk_size):
fd.write(chunk)
chunk_size is the chunk size which you want to use. If you set it as 2000, then requests will download that file the first 2000 bytes, write them into the file, and do this again, again and again, unless it finished.
So this can save your RAM. But I'd prefer use response.content instead in this case since your file is small. As you can see use response.raw is complex.
Relates:
How to download large file in python with requests.py?
How to download image using requests
In Python 3, I find pathlib is the easiest way to do this. Request's response.content marries up nicely with pathlib's write_bytes.
from pathlib import Path
import requests
filename = Path('metadata.pdf')
url = 'http://www.hrecos.org//images/Data/forweb/HRTVBSH.Metadata.pdf'
response = requests.get(url)
filename.write_bytes(response.content)
You can use urllib:
import urllib.request
urllib.request.urlretrieve(url, "filename.pdf")
Please note I'm a beginner. If My solution is wrong, please feel free to correct and/or let me know. I may learn something new too.
My solution:
Change the downloadPath accordingly to where you want your file to be saved. Feel free to use the absolute path too for your usage.
Save the below as downloadFile.py.
Usage: python downloadFile.py url-of-the-file-to-download new-file-name.extension
Remember to add an extension!
Example usage: python downloadFile.py http://www.google.co.uk google.html
import requests
import sys
import os
def downloadFile(url, fileName):
with open(fileName, "wb") as file:
response = requests.get(url)
file.write(response.content)
scriptPath = sys.path[0]
downloadPath = os.path.join(scriptPath, '../Downloads/')
url = sys.argv[1]
fileName = sys.argv[2]
print('path of the script: ' + scriptPath)
print('downloading file to: ' + downloadPath)
downloadFile(url, downloadPath + fileName)
print('file downloaded...')
print('exiting program...')
Generally, this should work in Python3:
import urllib.request
..
urllib.request.get(url)
Remember that urllib and urllib2 don't work properly after Python2.
If in some mysterious cases requests don't work (happened with me), you can also try using
wget.download(url)
Related:
Here's a decent explanation/solution to find and download all pdf files on a webpage:
https://medium.com/#dementorwriter/notesdownloader-use-web-scraping-to-download-all-pdfs-with-python-511ea9f55e48
regarding Kevin answer to write in a folder tmp, it should be like this:
with open('./tmp/metadata.pdf', 'wb') as f:
f.write(response.content)
he forgot . before the address and of-course your folder tmp should have been created already

Python 2.7 download images

I'm using python 2.7 and pycharm is my editor. What i'm trying to do is have python go to a site and download an image from that site and save it to my directory. Currently I have no errors but i don't think its downloading because the file is not showing in my directory.
import random
import urllib2
def download_web_image(url):
name = random.randrange(1,1000)
full_name = str(name) + ".jpg"
urllib2.Request(url, full_name)
download_web_image("www.example.com/page1/picture.jpg")
This will do the trick. The rest can stay the same, just edit your function to include the two lines I have added.
def download_web_image(url):
name = random.randrange(1,1000)
full_name = str(name) + ".jpg"
request = urllib2.Request(url)
img = urllib2.urlopen(request).read()
with open (full_name, 'w') as f: f.write(img)
Edit 1:
Exact code as requested in comments.
import urllib2
def download_web_image(url):
request = urllib2.Request(url)
img = urllib2.urlopen(request).read()
with open ('test.jpg', 'w') as f: f.write(img)
download_web_image("http://upload.wikimedia.org/wikipedia/commons/8/8c/JPEG_example_JPG_RIP_025.jpg")
You are simply creating a Request but you are not downloading the image. Try the following instead:
urllib.urlretrieve(url, os.path.join(os.getcwd(), full_name)) # download and save image
Or try the requests library:
import requests
image = requests.get("www.example.com/page1/picture.jpg")
with open('picture.jpg', 'wb') as f:
f.write(image.content)

Categories

Resources