I'm trying to download an image from a website but I get a 404 error. I tried adding a user agent, with no success.
Here is the code:
import requests
import shutil

with open(r'C:\Users\home\Desktop\urls.csv') as file:
    csv = []
    for row in file:
        csv.append(row.split(";"))

row = 0
while row < len(csv):
    r = requests.get(csv[row][0], stream=True, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        with open(r"C:\Users\home\Desktop\images\house" + str(row) + ".jpg", 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    row += 1
The URL is:
https://example.com/wp-content/uploads/2018/10/toronto-curbed-8.jpg
(replace example with cdn.icepop)
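One thing worth ruling out (an assumption on my part, since the CSV itself isn't shown): rows read from the file keep their trailing newline, so csv[row][0] may carry stray whitespace, and the URL actually requested may not be exactly the one that works in the browser. A small sketch that strips the value and logs what was requested:

import csv
import shutil
import requests

with open(r'C:\Users\home\Desktop\urls.csv', newline='') as f:
    rows = [row for row in csv.reader(f, delimiter=';') if row]

for i, row in enumerate(rows):
    url = row[0].strip()  # drop any stray whitespace/newline around the URL
    r = requests.get(url, stream=True, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        with open(rf"C:\Users\home\Desktop\images\house{i}.jpg", 'wb') as out:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, out)
    else:
        print(i, r.status_code, repr(url))  # shows exactly what was requested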
I'm unable to get the snippet below to work. Any ideas? I'm not sure what exactly is wrong here.

# Parse HTTP headers
headers = request.split('\n')
filename = headers[0].split()[1]

# Get the content of the file
if filename == '/':
    filename = '/index.html'

# Get the content of htdocs/index.html
try:
    fin = open(filename)
    content = fin.read()
    fin.close()
# Error, not found
except FileNotFoundError:
    response = 'HTTP/1.0 404 NOT FOUND\n\nFile Not Found'
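For context, this fragment looks like it comes from a bare-sockets HTTP server, and on its own it neither receives the request nor sends a response. A minimal sketch of the surrounding loop it could slot into (the host, port, and htdocs directory are assumptions, since they aren't in the fragment; note the fragment opens filename directly even though its comment says the file lives under htdocs):

import socket

SERVER_HOST = '0.0.0.0'  # assumption: listen on all interfaces
SERVER_PORT = 8000       # assumption: arbitrary local port

server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind((SERVER_HOST, SERVER_PORT))
server.listen(1)

while True:
    conn, addr = server.accept()
    request = conn.recv(1024).decode()

    # Parse HTTP headers: the request line looks like "GET /index.html HTTP/1.1"
    headers = request.split('\n')
    filename = headers[0].split()[1]
    if filename == '/':
        filename = '/index.html'

    try:
        # Look the file up under htdocs, as the original comment implies
        with open('htdocs' + filename) as fin:
            content = fin.read()
        response = 'HTTP/1.0 200 OK\n\n' + content
    except FileNotFoundError:
        response = 'HTTP/1.0 404 NOT FOUND\n\nFile Not Found'

    conn.sendall(response.encode())
    conn.close()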
I ran the code below. Most of it works, but when I run the "for elm in collect" block, I get an error: HTTPError: HTTP Error 403: Forbidden. Can anyone help with this? Thanks!
import requests
from bs4 import BeautifulSoup
import urllib.request
import os

resp = requests.get('https://www.williams.edu/institutional-research/common-data-set/',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html5lib')

links = [a['href'] for a in soup.select('li a[href]')]

collect = []
for link in links:
    if "https://www.williams.edu/institutional-research/files/" in link:
        collect.append(link)

for elm in collect:
    def main():
        download_file(elm)  # the elm is an url.

    def download_file(download_url):  # the download_url is the elm.
        save_path = 'C:/Users/WM'
        file_name = elm.split("/")[-1]
        complete_name = os.path.join(save_path, file_name)
        response = urllib.request.urlopen(download_url)
        file = open(complete_name, 'wb')
        file.write(response.read())
        file.close()
        print("Completed")

if __name__ == "__main__":
    main()
I'm not sure why there is a mixed use of requests and urllib in your code. Just request the download_url in the loop, as you do with the initial URL, and add the same header:
response = requests.get(download_url, headers={'User-Agent': 'Mozilla/5.0'})
Example
import requests
from bs4 import BeautifulSoup
import os

resp = requests.get('https://www.williams.edu/institutional-research/common-data-set/',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html5lib')

links = [a['href'] for a in soup.select('li a[href]')]

collect = []
for link in links:
    if "https://www.williams.edu/institutional-research/files/" in link:
        collect.append(link)

def download_file(download_url):
    save_path = 'C:/Users/WM'
    file_name = download_url.split("/")[-1]
    complete_name = os.path.join(save_path, file_name)
    # Same User-Agent header as the initial request, so the server doesn't reject us
    response = requests.get(download_url, headers={'User-Agent': 'Mozilla/5.0'})
    with open(complete_name, 'wb') as file:
        file.write(response.content)

def main():
    for elm in collect:
        download_file(elm)  # elm is the file URL
    print("Completed")

if __name__ == "__main__":
    main()
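For completeness: if you did want to keep urllib for the download step, the 403 most likely comes from urllib's default "Python-urllib/x.y" User-Agent, and the same browser-style header can be attached there too. A small sketch (the helper name is just for illustration):

import urllib.request

def download_with_urllib(download_url, complete_name):
    # Attach the same User-Agent header instead of urllib's default one
    req = urllib.request.Request(download_url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response, open(complete_name, 'wb') as file:
        file.write(response.read())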
I have an Excel list of DOIs of papers I'm interested in. Based on this list, I would like to download all the papers.
I tried to do it with requests, as recommended in their documentation, but the PDF files I get are damaged. They are only a few KB big. I changed the chunk_size several times, from None up to 1024*1024, and I have read many posts already. Nothing helps.
Please, what are your ideas?
import pandas as pd
import os
import requests

def get_pdf(doi, file_to_save_to):
    url = 'http://api.elsevier.com/content/article/doi:'+doi+'?view=FULL'
    headers = {
        'X-ELS-APIKEY': "keykeykeykeykeykey",
        'Accept': 'application/pdf'
    }
    r = requests.get(url, stream=True, headers=headers)
    if r.status_code == 200:
        for chunk in r.iter_content(chunk_size=1024*1024):
            file_to_save_to.write(chunk)
            return True

doi_list = pd.read_excel('list.xls')
doi_list.columns = ['DOIs']

count = 0
for doi in doi_list['DOIs']:
    doi = doi.replace('DOI:','')
    pdf = doi.replace('/','%')
    if not os.path.exists(f'path/{pdf}.pdf'):
        file = open(f'path/{pdf}.pdf', 'wb')
        get_pdf(doi, file)
        count += 1
        print(f"Downloaded: {count} of {len(doi_list['DOIs'])} articles")
I think your problem is the return True inside the for chunk in r.iter_content loop. With that line, you'll only ever write one chunk of the PDF, of at most chunk_size bytes.
You should also open files using with; as it is, you never close the file handles.
import pandas as pd
import os
import requests

HEADERS = {
    'X-ELS-APIKEY': "keykeykeykeykeykey",
    'Accept': 'application/pdf'
}

def get_pdf(doi, file_to_save_to):
    url = f'http://api.elsevier.com/content/article/doi:{doi}?view=FULL'
    with requests.get(url, stream=True, headers=HEADERS) as r:
        if r.status_code == 200:
            for chunk in r.iter_content(chunk_size=1024*1024):
                file_to_save_to.write(chunk)

doi_list = pd.read_excel('list.xls')
doi_list.columns = ['DOIs']

count = 0
for doi in doi_list['DOIs']:
    doi = doi.replace('DOI:','')
    pdf = doi.replace('/','%')
    if not os.path.exists(f'path/{pdf}.pdf'):
        with open(f'path/{pdf}.pdf', 'wb') as file:
            get_pdf(doi, file)
        count += 1
        print(f"Downloaded: {count} of {len(doi_list['DOIs'])} articles")
I am trying to make a copy of Smartsheet data on my local disk. I am able to copy all of the Smartsheet data except for the cell images. Below is the code I am using; it works fine for copying the data, but not the cell images.
NOTE: I am not trying to copy the attachments from Smartsheet; only the cell images and data.
Could someone help me enhance this code so it copies the cell images as well?
import json
import os
import requests
import time

token = "Bearer <TOken>"
backed_up_sheets = {"Attach": 86960044478894, "test2": 6659760455684}
dir = r'C:\Users\\me\SmartSheetsBackup\WorkSheet' + time.strftime("-%m_%d_%Y_%H_%M")
API_URL = "https://api.smartsheet.com/2.0/sheets/"
payload = {"Authorization": token,
           "Accept": "application/vnd.ms-excel,image/*"}

amount = len(backed_up_sheets)
i = 1
for el in backed_up_sheets:
    r = requests.get(API_URL + str(backed_up_sheets[el]), headers=payload)
    if r.status_code != 200:
        print('Some problem with connections, please retry later')
        pass
    if not os.path.exists(dir):
        os.makedirs(dir)
    with open(dir + el + time.strftime("-%m_%d_%Y_%H_%M") + ".xlsx", 'wb') as output:
        output.write(r.content)
    print('Progress in sheets: ' + str(i) + '/' + str(amount))
    i += 1
Here's a complete code sample:
# Download an image in a cell
import requests

def download_cell_image(client, sheet_id, row_id, column_id, default_filename):
    # Get the desired row
    row = client.Sheets.get_row(sheet_id, row_id)
    cell = row.get_column(column_id)
    image = cell.image
    filename = getattr(image, 'alt_text', default_filename)

    # Obtain a temporary image URL
    imageUrl = client.models.ImageUrl({"imageId": image.id})
    response = client.Images.get_image_urls([imageUrl])
    url = response.image_urls[0].url

    # Download the image
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
Note that this requires SDK version 1.3.0 or later
The same steps illustrated in the cURL example should work in Python. (Apologies that we don't have a complete published sample.)
1. Get the image id from the cell object, as returned from get_sheet.
2. Convert the image id to a download URL, using images.get_image_urls (docs).
3. Download the image from the URL, probably using the Requests library.
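Something like the following would tie those steps together, assuming the smartsheet Python SDK (1.3.0 or later) and the download_cell_image helper from the sample above; the access token is a placeholder, and the sheet id is taken from the question:

import smartsheet

client = smartsheet.Smartsheet('<access_token>')  # placeholder token
SHEET_ID = 86960044478894  # one of the sheet ids from the question

# Walk every cell in the sheet and download any cell image found
sheet = client.Sheets.get_sheet(SHEET_ID)
for row in sheet.rows:
    for cell in row.cells:
        if getattr(cell, 'image', None):
            download_cell_image(client, SHEET_ID, row.id, cell.column_id,
                                str(cell.image.id) + '.jpg')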
I use this code to download jpg files without any problem. But as you can see, the following page source includes a lot of images whose path is blank.gif:
<a href="/en/chowchow-puppy-sleeping-dogs-pet-448311/"><img src="/static/img/blank.gif"
My question: is it possible to add a check that detects when the src is blank.gif and then automatically downloads the 640*426 image file from "https://pixabay.com/en/chowchow-puppy-sleeping-dogs-pet-448311/", and how would I achieve that?
import random
import requests
from bs4 import BeautifulSoup

# got from http://stackoverflow.com/a/16696317
def download_file(url):
    local_filename = url.split('/')[-1]
    print("Downloading {} ---> {}".format(url, local_filename))
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return local_filename

def Download_Image_from_Web(url):
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for link in soup.findAll('img'):
        image_links = link.get('src')
        if not image_links.startswith('http'):
            image_links = url + '/' + image_links
        download_file(image_links)

Download_Image_from_Web("https://pixabay.com/en/photos/?q=sleeping+puppy&hp=&image_type=&cat=&min_width=&min_height=")
Updated version. Read comments for additional info.
import random
import requests
from bs4 import BeautifulSoup

# got from http://stackoverflow.com/a/16696317
def download_file(url):
    local_filename = url.split('/')[-1]
    print("Downloading {} ---> {}".format(url, local_filename))
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return local_filename

def Download_Image_from_Web(url):
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for link in soup.findAll('img'):
        image_links = link.get('src')
        if image_links.endswith('blank.gif'):
            # Lazily loaded thumbnails use the blank.gif placeholder in src;
            # the real image URL is expected in the data-lazy attribute
            image_links = link.get('data-lazy')
        if not image_links:
            continue  # skip images with neither a usable src nor data-lazy
        if not image_links.startswith('http'):
            image_links = url + '/' + image_links
        download_file(image_links)

Download_Image_from_Web("https://pixabay.com/en/photos/?q=sleeping+puppy&hp=&image_type=&cat=&min_width=&min_height=")