I'm new to python and really struggling with writing a program to download a CSV from this page
So far I have:
import csv
import requests
import os
output_dir = 'C:/Users/Moshe/Downloads'
output_file = 'Covid_19_uk_timeseries.csv'
CSV_URL = 'https://coronavirus.data.gov.uk/downloads/csv/coronavirus-deaths_latest.csv'
assert os.path.exists(output_dir)# test that we can write to output_dir
with requests.Session() as s:
download = s.get(CSV_URL)
decoded_content = download.content.decode('utf-8')
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
my_list = list(cr)
for row in my_list:
print(row)
Its output show that it gets the CSV fine, and has access to the output directory. But for now I just want to save it as it is, and can't seem to work out how to do that.
def main():
import requests
url = "https://coronavirus.data.gov.uk/downloads/csv/coronavirus-deaths_latest.csv"
with requests.get(url, stream=True) as response:
response.raise_for_status()
with open("deaths_latest.csv", "wb") as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
file.flush()
print("Done")
return 0
if __name__ == "__main__":
import sys
sys.exit(main())
You can use
csv.writerows()
to write the csv file. Check this site for more. https://www.programiz.com/python-programming/writing-csv-files
Related
Hi I have created a piece of code that downloads data from a api end point and also loads in the apikeys.
I am trying to achieve downloading the api data into csv files into their own folder based on the input.csv I have tried to achieve this by adding the following section at the end. The problem is that it does not download the file its to be receiving from the api end point.
Please assist?
with open('filepath/newfile.csv', 'w+') as f:
f.write(r.text)
import csv
import sys
import requests
def query_api(business_id, api_key):
headers = {
"Authorization": api_key
}
r = requests.get('https://api.link.com', headers=headers)
print(r.text)
# get filename from command line arguments
if len(sys.argv) < 2:
print ("input.csv")
sys.exit(1)
csv_filename = sys.argv[1]
with open(csv_filename) as csv_file:
csv_reader = csv.DictReader(csv_file, delimiter=',')
for row in csv_reader:
business_id = row['BusinessId']
api_key = row['ApiKey']
query_api(business_id, api_key)
with open('filepath/newfile.csv', 'w+') as f:
f.write(r.text)
I'm having trouble organizing my CSV file full of urls and downloading each image per url.
https://i.imgur.com/w1slgf6.png
It's quite hell, but the goal is to:
Write the src of these images into a csv file, splitting each url per line.
And download each image
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import urllib.request
import pandas as pd
import requests
import urllib
import csv
# BeautifulSoup4 findAll src from img
print ('Downloading URLs to file')
sleep(1)
with open('output.csv', 'w', newline='\n', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(srcs)
print ('Downloading images to folder')
sleep(1)
filename = "output"
with open("{0}.csv".format(filename), 'r') as csvfile:
# iterate on all lines
i = 0
for line in csvfile:
splitted_line = line.split(',')
# check if we have an image URL
if splitted_line[1] != '' and splitted_line[1] != "\n":
urllib.request.urlretrieve(splitted_line[1], "img_" + str(i) + ".png")
print ("Image saved for {0}".format(splitted_line[0]))
i += 1
else:
print ("No result for {0}".format(splitted_line[0]))
Base on the limited resources that you provided, I think this is the code that you need:
import requests
with open('output.csv', 'r') as file:
oldfile = file.read()
linkslist = oldfile.replace("\n", "") # Because your file is wrongly splitted by new lines so I removed it
links = linkslist.split(",")
with open('new.csv', 'w') as file: # Writing all your links to a new file, this can combine with the below code but I think open file and requests at the same time will make it slower
for link in links:
file.write(link + "\n")
for link in links:
response = requests.get(link) # This is to save image
file = open("(yourfilenamehere).png", "wb") # Replace the name that you want for the picture in here
file.write(response.content)
file.close()
Please find comments of explanation inside the code, If you have any problem, just ask, I haven't tested it because I don't have your exact CSV but it should work
Currently im facing following problem:
I have 3 download links in a list. Only the last file in the list is downloaded completely.
The others have a file size of one kilobyte.
Code:
from requests import get
def download(url, filename):
with open(filename, "wb") as file:
response = get(url, stream=True)
file.write(response.content)
for link in f:
url = link
split_url = url.split("/")
filename = split_url[-1]
filename = filename.replace("\n", "")
download(url,filename)
The result looks like this:
Result
How do I make sure that all files are downloaded correctly?
All links are direct download links.
Thanks in advance!
EDIT:
I discovered it only happens when I read the links from the .txt
If I create the list in python like this:
links = ["http://ipv4.download.thinkbroadband.com/20MB.zip",
"http://ipv4.download.thinkbroadband.com/10MB.zip",
"http://ipv4.download.thinkbroadband.com/5MB.zip"]
... the problem doesnt appear.
reproduceable example:
from requests import get
def download(url, filename):
with open(filename, "wb") as file:
response = get(url, stream = True)
file.write(response.content)
f = open('links.txt','r')
for link in f:
url = link
split_url = url.split("/")
filename = split_url[-1]
filename = filename.replace("\n", "")
download(url,filename)
content of links.txt:
http://ipv4.download.thinkbroadband.com/20MB.zip
http://ipv4.download.thinkbroadband.com/10MB.zip
http://ipv4.download.thinkbroadband.com/5MB.zip
url = url.replace("\n", "")
solved it!
I am new in python so i have the specific json and i have extracted the
documentElement value from the dictionary and then made it a list...How to download this pdfs automatically to a directory folder?
import urllib.request
import requests
import json
url = 'https://diavgeia.gov.gr/luminapi/api/search/export?q=decisionType:%22%CE%93%CE%9D%CE%A9%CE%9C%CE%9F%CE%94%CE%9F%CE%A4%CE%97%CE%A3%CE%97%22&OrganizationUid:%2250024%22&status:%22%CE%91%CE%BD%CE%B1%CF%81%CF%84%CE%B7%CE%BC%CE%AD%CE%BD%CE%B7%22&page=1&size=4&wt=json'
#get urls
response = requests.get(url)
with urllib.request.urlopen(url) as u:
data = json.loads(u.read().decode())
#add links to the list
pdf_links = list()
for key in data:
for x in data[key]:
pdf_links.append(x['documentUrl'])
#print
print(pdf_links)
Here we go:
import requests
response = requests.get('https://diavgeia.gov.gr/luminapi/api/search/export?q=decisionType:%22%CE%93%CE%9D%CE%A9%CE%9C%CE%9F%CE%94%CE%9F%CE%A4%CE%97%CE%A3%CE%97%22&OrganizationUid:%2250024%22&status:%22%CE%91%CE%BD%CE%B1%CF%81%CF%84%CE%B7%CE%BC%CE%AD%CE%BD%CE%B7%22&page=1&size=4&wt=json')
for doc in response.json()['decisionResultList']:
r = requests.get(doc['documentUrl'], stream=True)
with open('{}.pdf'.format(doc['ada']), 'wb') as f:
for chunk in r:
f.write(chunk)
Following files has been downloaded to my PC:
So I am pulling jpg's from a url. I am able to save the image files as long as they are being saved to the same folder the python file is in. As soon as I attempt to change the folder(seen here as the outpath) the image files do not get created. I imagine it has something to do with my outpath, but it seems to be fine when I am printing and watching it in the console.
Ubuntu 11.10 OS by the way. I'm a newbie with both linux and python, so it could easily be either. :)
If I were to print the sequence taken from the CSV file it would look like: [['Champ1', 'Subname1', 'imgurl1'],['Champ2', 'subname2', 'imgurl2'],['Champ3','subname3','imgurl3']...]
(It was scraped from a website)
import csv
from urlparse import urlsplit
from urllib2 import urlopen, build_opener
from urllib import urlretrieve
import webbrowser
import os
import sys
reader = csv.reader(open('champdata.csv', "rb"), delimiter = ",", skipinitialspace=True)
champInfo = []
for champs in reader:
champInfo.append(champs)
size = len(champInfo)
def GetImages(x, out_folder="/home/sean/Home/workspace/CP/images"):
b=1
size = len(champInfo)
print size
while b < size:
temp_imgurls = x.pop(b)
filename = os.path.basename(temp_imgurls[2])
print filename
outpath = os.path.join(out_folder, filename)
print outpath
u = urlopen(temp_imgurls[2])
localFile = open(outpath, 'wb')
localFile.write(u.read())
localFile.close()
b+=1
GetImages(champInfo)
I understand it's quite crude, but it does work, only if I'm not attempting to change the save path.
Try providing the complete image path everywhere
E:/../home/sean/Home/workspace/CD/images
def GetImages(x):
b=1
size = len(champInfo)
print size
while b < size:
temp_imgurls = x.pop(b)
filename = temp_imgurls[2]
u = urlopen(temp_imgurls[2])
localFile = open(filename, 'wb')
localFile.write(u.read())
localFile.close()
And this code will be save files in the same directory where script is.
Updated Answer:
I think the answer to your problem is just to add a check for the output directory existence, and create it if needed. ie, add:
if not os.path.exists(out_folder):
os.makedirs(out_folder)
to your existing code.
More generally , you could try something more like this:
import csv
from urllib2 import urlopen
import os
import sys
default_outfolder = "/home/sean/Home/workspace/CD/images"
# simple arg passing wihtout error checking
out_folder = sys.argv[1] if len(sys.argv) == 2 else default_outfolder
if not os.path.exists(out_folder):
os.makedirs(out_folder) # creates out_folder, including any required parent ones
else:
if not os.path.isdir(out_folder):
raise RuntimeError('output path must be a directory')
reader = csv.reader(open('champdata.csv', "rb"), delimiter = ",", skipinitialspace=True)
for champs in reader:
img_url = champs[2]
filename = os.path.basename(img_url)
outpath = os.path.join(out_folder, filename)
print 'Downloading %s to %s' % (img_url, outpath)
with open(outpath, 'wb') as f:
u = urlopen(img_url)
f.write(u.read())
The above code works for champdata.csv of the form stuff,more_stuff,http://www.somesite.com.au/path/to/image.png
but will need to be adapted if I have not understood the actual format of your incoming data.