Currently I'm facing the following problem:
I have 3 download links in a list. Only the last file in the list is downloaded completely.
The others have a file size of one kilobyte.
Code:
from requests import get
def download(url, filename):
    """Fetch ``url`` and write the response body to ``filename``.

    The destination file is opened (and therefore created) before the
    request is sent; the whole body is then written in a single call.
    """
    with open(filename, "wb") as out_file:
        reply = get(url, stream=True)
        out_file.write(reply.content)
# Download every link yielded by ``f``, saving each one under the last
# path segment of its URL.
for link in f:
    url = link
    # Only the file name has "\n" removed here; ``url`` is passed on unchanged.
    filename = url.split("/")[-1].replace("\n", "")
    download(url, filename)
The result looks like this:
Result
How do I make sure that all files are downloaded correctly?
All links are direct download links.
Thanks in advance!
EDIT:
I discovered it only happens when I read the links from the .txt
If I create the list in python like this:
# The same three links that links.txt contains, hard-coded as a Python list.
links = ["http://ipv4.download.thinkbroadband.com/20MB.zip",
"http://ipv4.download.thinkbroadband.com/10MB.zip",
"http://ipv4.download.thinkbroadband.com/5MB.zip"]
... the problem doesn't appear.
Reproducible example:
from requests import get
def download(url, filename):
    """Save the body of the response for ``url`` into ``filename``.

    ``filename`` is opened for binary writing first, then the request is
    made and the complete response content is written at once.
    """
    with open(filename, "wb") as target:
        result = get(url, stream=True)
        target.write(result.content)
# Read the links from links.txt, one per line, and download each of them.
f = open('links.txt', 'r')
for link in f:
    url = link
    # The file name is the last "/"-separated piece with "\n" removed;
    # ``url`` itself is handed to download() exactly as read from the file.
    filename = url.split("/")[-1].replace("\n", "")
    download(url, filename)
content of links.txt:
http://ipv4.download.thinkbroadband.com/20MB.zip
http://ipv4.download.thinkbroadband.com/10MB.zip
http://ipv4.download.thinkbroadband.com/5MB.zip
# Iterating over the file yields each link with its line ending still attached;
# remove it from the URL itself (not only from the filename) before requesting it.
url = url.replace("\n", "")
solved it!
Related
I have been trying to scrape some images using BeautifulSoup in Python and I am facing a problem: I can successfully scrape the image links and store files in the folder, but the saved images are in an unsupported format.
# Fetch the landing page and keep a copy of its HTML in op.html.
res = requests.get('https://books.toscrape.com/')
res.raise_for_status()
file = open('op.html', 'wb')
file.writelines(res.iter_content(10000))

os.makedirs('images', exist_ok=True)

# Read the saved page back and parse it.
newfile = open("op.html", 'rb')
soup = BeautifulSoup(newfile.read(), 'html.parser')

# One output file per <img> tag, named after the basename of its src.
for link in soup.find_all('img'):
    ll = link.get('src')
    ima = open(os.path.join('images', os.path.basename(ll)), 'wb')
    # The bytes written here come from ``res`` (the page response), not from ``ll``.
    for down in res.iter_content(1000):
        ima.write(down)
It says file format not supported even though it's in a jpeg format
output image in a folder
This line for down in res.iter_content(1000): is not iterating the image from ll - it is reiterating the html result. Your OS may recognize the file from the extension (.jpeg), but this is only because of the filename - not the content (which is not JPEG, but HTML, and hence the error).
You should make another request for the image itself, so it can be fetched and stored:
from urllib.parse import urljoin

# Fetch every image referenced by an <img> tag and store it under images/.
for link in soup.find_all('img'):
    ll = link.get('src')
    if not ll:
        # <img> without a src attribute: nothing to download
        continue
    # urljoin resolves relative paths against the site URL and always uses "/";
    # os.path.join is for file-system paths and would insert "\" on Windows.
    img_rs = requests.get(urljoin('https://books.toscrape.com/', ll))  # <-- this line
    img_rs.raise_for_status()
    # The with-block closes (and flushes) each image file once it is written.
    with open(os.path.join('images', os.path.basename(ll)), 'wb') as ima:
        for down in img_rs.iter_content(1000):  # <-- and iterate on the result
            ima.write(down)
The reason for saving the HTML is obscure. So, ignoring that part of the code in question, it comes down to this:
import requests
from os.path import join, basename
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
URL = 'https://books.toscrape.com'
TARGET_DIR = '/tmp'

# A single session is reused for the page request and for every image request.
with requests.Session() as session:
    r = session.get(URL)
    r.raise_for_status()
    soup = BS(r.text, 'lxml')
    for image in soup.find_all('img'):
        src = image['src']
        # Resolve the (possibly relative) src against the site URL.
        r = session.get(urljoin(URL, src), stream=True)
        r.raise_for_status()
        target_path = join(TARGET_DIR, basename(src))
        # Write the image to disk chunk by chunk.
        with open(target_path, 'wb') as t:
            for chunk in r.iter_content(chunk_size=8192):
                t.write(chunk)
In terms of performance, this can be significantly enhanced by multithreading
Your problem is that after you find the URL of the image you don't do anything with it; instead you try to save the whole initial response, which is just the HTML of the page. Try something like this instead:
from urllib.parse import urljoin

base_url = 'https://books.toscrape.com/'
res = requests.get(base_url)
res.raise_for_status()

# Keep a copy of the page on disk. The with-block closes the file, so the
# HTML is completely flushed before it is read back below.
with open('op.html', 'wb') as file:
    for i in res.iter_content(10000):
        file.write(i)

os.makedirs('images', exist_ok=True)

with open("op.html", 'rb') as newfile:
    data = newfile.read()
soup = BeautifulSoup(data, 'html.parser')

# Request each image itself and stream it into images/<basename of src>.
for link in soup.find_all('img'):
    ll = link.get('src')
    if not ll:
        # <img> without a src attribute: nothing to download
        continue
    ima = os.path.join('images', os.path.basename(ll))
    # urljoin builds a proper URL; os.path.join is meant for file-system paths.
    current_img = urljoin(base_url, ll)
    # Using the response as a context manager releases the connection afterwards.
    with requests.get(current_img, stream=True) as img_res:
        img_res.raise_for_status()
        # .raw bypasses requests' content decoding; ask urllib3 to undo any
        # gzip/deflate transfer encoding so the saved bytes are the real image.
        img_res.raw.decode_content = True
        with open(ima, 'wb') as f:
            shutil.copyfileobj(img_res.raw, f)
The goal is to download GTFS data through python web scraping, starting with https://transitfeeds.com/p/agence-metropolitaine-de-transport/129/latest/download
Currently, I'm using requests like so:
def download(url):
    """Download ``url`` and save the response body to ``prov/city/GTFS``.

    Prints "Saving file." when the response is OK and "Download failed."
    otherwise. Nothing is returned.
    """
    fpath = "prov/city/GTFS"
    r = requests.get(url)
    if r.ok:
        print("Saving file.")
        # The with-block closes the handle, so the bytes are flushed to disk.
        with open(fpath, "wb") as out_file:
            out_file.write(r.content)
    else:
        print("Download failed.")
The results of requests.content of the above url unfortunately renders the following:
You can see the files of interest within the output (e.g. stops.txt) but how might I access them to read/write?
I fear you're trying to read a zip file with a text editor, perhaps you should try using the "zipfile" module.
The following worked:
def download(url):
    """Download ``url`` and store it as ``output.zip`` inside ``fpath``.

    Prints the destination on success, or the HTTP status code when the
    response is not OK. Nothing is returned.
    """
    fpath = "path/to/output/"
    # NOTE: ``headers`` is not defined in this snippet; it must be provided by
    # the surrounding script.
    with requests.get(url, stream=True, headers=headers) as f:
        if f.ok:
            print("Saving to {}".format(fpath))
            with open(fpath + 'output.zip', 'wb') as g:
                # Write chunk by chunk so stream=True actually avoids loading
                # the whole archive into memory.
                for chunk in f.iter_content(chunk_size=8192):
                    g.write(chunk)
        else:
            print("Download failed with error code: ", f.status_code)
You need to write this file into a zip.
import requests
url = "https://transitfeeds.com/p/agence-metropolitaine-de-transport/129/latest/download"
fname = "gtfs.zip"
r = requests.get(url)
# Fail loudly rather than saving an HTTP error page under a .zip name.
r.raise_for_status()
# The with-block closes the file so the archive is completely written.
with open(fname, "wb") as zip_file:
    zip_file.write(r.content)
Now fname exists and has several text files inside. If you want to programmatically extract this zip and then read the content of a file, for example stops.txt, then you need first to extract a single file, or simply extractall.
import zipfile
# Open the archive once; the with-block closes it when extraction is done.
with zipfile.ZipFile(fname) as archive:
    # this will extract only a single file, and
    # raise a KeyError if the file is missing from the archive
    archive.extract("stops.txt")
    # this will extract all the files found from the archive,
    # overwriting files in the process
    archive.extractall()
Now you just need to work with your file(s).
import csv

thefile = "stops.txt"

# just plain text
with open(thefile) as text_file:
    text = text_file.read()

# csv file: the csv module expects the file to be opened with newline=""
# so that quoted fields containing line breaks are parsed correctly
with open(thefile, newline="") as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        ...
I wrote a program that reads image URLs from a CSV file and downloads the images to a local folder in Python, but the program shows the error below:
"TypeError: cannot use a string pattern on a bytes-like object"
Please check the Code in below
import pandas as pd
import urllib.request
def url_to_jpg(i, url, File_Path):
    """Retrieve ``url`` and store it as ``image_<i>.jpg``.

    ``File_Path`` is used as a plain string prefix: the file name is appended
    to it directly, with no separator inserted. Prints a confirmation line
    and returns ``None``.
    """
    filename = f'image_{i}.jpg'
    full_path = f'{File_Path}{filename}'
    urllib.request.urlretrieve(url, full_path)
    print(f'{filename} saved.')
    return None
FileName = "C:/Users/IT City/Desktop/Kwiat-USA/KavantaCSV.csv"
File_Path = "C:/Users/IT City/Desktop/Kwiat-USA/images"

# Load the CSV and pass each item of ``urls.values``, with its index,
# to url_to_jpg.
urls = pd.read_csv(FileName)
i = 0
for url in urls.values:
    url_to_jpg(i, url, File_Path)
    i += 1
Need your Immediate help. Help will be highly Appreciated.
Thank You
Maybe you have to decode it?
-or you can simply use the requests library.
r = requests.get(url)
# The with-block closes the file even if the write fails; plain "wb" is
# enough because nothing is read back from the file.
with open(filename, "wb") as f:
    f.write(r.content)
The url being passed to urlretrieve is an array: urls.values gives a 2D array, so each item yielded by enumerate(urls.values) is a row (an array), not a string.
it always helps if the stack trace is also posted.
Drawing inspiration from this post, I am trying to download a bunch of xml files in batch from a website:
import urllib2
url = 'http://ratings.food.gov.uk/open-data/'
# Read the complete response for ``url`` into memory.
data = urllib2.urlopen(url).read()
# Write the bytes exactly as received into data.zip.
with open("C:\Users\MyName\Desktop\data.zip", "wb") as code:
    code.write(data)
The zip file is created within seconds, but as I attempt to access it, an error window comes up:
Windows cannot open the folder.
The Compressed (zipped) Folder "C:\Users\MyName\Desktop\data.zip" is invalid.
What am I doing wrong here?
you are not opening file handles inside the zip file:
import urllib2
from bs4 import BeautifulSoup
import zipfile
url = 'http://ratings.food.gov.uk/open-data/'

# Fetch the page and collect the href of every link found in the tables
# inside the element with id 'openDataStatic'.
mainpage = urllib2.urlopen(url).read()
soup = BeautifulSoup(mainpage, 'html.parser')
tablewrapper = soup.find(id='openDataStatic')
fileurls = []
for table in tablewrapper.find_all('table'):
    fileurls.extend(link['href'] for link in table.find_all('a'))

# Download each linked file and store it as a member of data.zip, named
# after the last path segment of its URL.
with zipfile.ZipFile("data.zip", "w") as code:
    for url in fileurls:
        print('Downloading: %s' % url)
        data = urllib2.urlopen(url).read()
        xmlfilename = url.rsplit('/', 1)[-1]
        code.writestr(xmlfilename, data)
You are doing nothing to encode this as a zip file. If you instead open it in a plain text editor such as Notepad, it should show you the raw XML.
I'm using Python 2.7 and PyCharm is my editor. What I'm trying to do is have Python go to a site, download an image from that site and save it to my directory. Currently I get no errors, but I don't think it's downloading because the file is not showing up in my directory.
import random
import urllib2
def download_web_image(url):
    """Build a randomly numbered ``.jpg`` name and a ``urllib2.Request`` for ``url``.

    The Request object is only constructed: it is neither opened nor stored,
    and nothing is written to disk.
    """
    full_name = str(random.randrange(1, 1000)) + ".jpg"
    urllib2.Request(url, full_name)


download_web_image("www.example.com/page1/picture.jpg")
This will do the trick. The rest can stay the same, just edit your function to include the two lines I have added.
def download_web_image(url):
    """Download the image at ``url`` into a randomly numbered ``.jpg`` file.

    The file name is ``<n>.jpg`` with ``n`` drawn from ``range(1, 1000)``;
    the file is created in the current working directory.
    """
    name = random.randrange(1, 1000)
    full_name = str(name) + ".jpg"
    request = urllib2.Request(url)
    img = urllib2.urlopen(request).read()
    # Binary mode is required for image data: text mode ('w') translates
    # newline bytes on Windows and corrupts the file.
    with open(full_name, 'wb') as f:
        f.write(img)
Edit 1:
Exact code as requested in comments.
import urllib2
def download_web_image(url):
    """Download the image at ``url`` and save it as ``test.jpg``."""
    request = urllib2.Request(url)
    img = urllib2.urlopen(request).read()
    # Binary mode is required for image data: text mode ('w') translates
    # newline bytes on Windows and corrupts the file.
    with open('test.jpg', 'wb') as f:
        f.write(img)


download_web_image("http://upload.wikimedia.org/wikipedia/commons/8/8c/JPEG_example_JPG_RIP_025.jpg")
You are simply creating a Request but you are not downloading the image. Try the following instead:
# urlretrieve takes the URL and a destination path; here the destination is
# full_name inside the current working directory.
urllib.urlretrieve(url, os.path.join(os.getcwd(), full_name)) # download and save image
Or try the requests library:
import requests
# requests needs the scheme in the URL; without "http://" it raises MissingSchema.
image = requests.get("http://www.example.com/page1/picture.jpg")
# Stop here on an HTTP error instead of saving an error page as a .jpg.
image.raise_for_status()
with open('picture.jpg', 'wb') as f:
    f.write(image.content)