I am trying to write a script to download images from Reddit using praw, saving the images to a folder of my choice, and exporting a .csv of the results.
I think I have coded it right since the images download, I am just getting an "Arrays must be the same length" error when I try to run the script.
I think this may have something to do with the "path" field in my dictionary, but the loops look like they're appending the information properly so I don't know. I am missing 2 entries from "path" I have no idea where they are dropping.
My code is below:
#! python3
import praw
import pandas as pd
import requests
path = r'C:\\Scripts\\IMG\\'
#Reddit API Tokens
reddit = praw.Reddit(client_id='x', \
client_secret='x', \
user_agent='x', \
username='x', \
password='x')
x_dict = {"id":[], \
"title":[], \
"url":[], \
"path":[]}
submissions = reddit.subreddit('x').hot(limit=100)
for submission in submissions:
x_dict["id"].append(submission.id)
x_dict["title"].append(submission.title)
x_dict["url"].append(submission.url)
if submission.url.endswith(".gifv"):
submission.url = submission.url.replace('.com/', '.com/download/')
submission.url = (submission.url + ".mp4")
r = requests.get(submission.url, allow_redirects=True)
if "gif" in r.headers['Content-Type']:
dir2 = os.path.join(path, submission.id + ".gif")
submission.url = (submission.url + ".gif")
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
else:
dir2 = os.path.join(path, submission.id + ".mp4")
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
elif "gfycat" in submission.url:
if "https://" in submission.url:
dir2 = os.path.join(path, submission.id + ".mp4")
submission.url = submission.url.replace('https://', 'https://giant.')
submission.url = (submission.url + ".mp4")
r = requests.get(submission.url, allow_redirects=True)
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
else:
dir2 = os.path.join(path, submission.id + ".mp4")
submission.url = submission.url.replace('http://', 'http://giant.')
submission.url = (submission.url + ".mp4")
r = requests.get(submission.url, allow_redirects=True)
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
elif "i.redd" in submission.url:
if submission.url.endswith(".jpg"):
dir2 = os.path.join(path, submission.id + ".jpg")
r = requests.get(submission.url, allow_redirects=True)
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
elif submission.url.endswith(".jpeg"):
dir2 = os.path.join(path, submission.id + ".jpeg")
r = requests.get(submission.url, allow_redirects=True)
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
elif submission.url.endswith(".png"):
dir2 = os.path.join(path, submission.id + ".png")
r = requests.get(submission.url, allow_redirects=True)
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
elif "v.redd" in submission.url:
dir2 = os.path.join(path, submission.id + ".mp4")
r = requests.get(submission.media['reddit_video']['fallback_url'], allow_redirects=True)
open(dir2, 'wb').write(r.content)
print ("downloading " + submission.id + " to " + dir2)
x_dict["path"].append(dir2)
elif submission.url is None:
print ("\\ " + submission.id + " url is none")
x_dict["path"].append('')
else:
print ("\\" + submission.id + " not supported")
x_dict["path"].append('')
continue
print (len(x_dict["id"]))
print (len(x_dict["title"]))
print (len(x_dict["url"]))
print (len(x_dict["path"]))
x_data = pd.DataFrame(x_dict)
x_data.to_csv(os.path.join(path,'xscrape.csv'))
Output is as follows
downloading 99rdbf to C:\\Scripts\\IMG\\99rdbf.jpg
100
100
100
98
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-434-0d78dff7cb84> in <module>()
89 print (len(x_dict["url"]))
90 print (len(x_dict["path"]))
---> 91 x_data = pd.DataFrame(x_dict)
92 x_data.to_csv(os.path.join(path,'xscrape.csv'))
d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site- packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
346 dtype=dtype, copy=copy)
347 elif isinstance(data, dict):
--> 348 mgr = self._init_dict(data, index, columns, dtype=dtype)
349 elif isinstance(data, ma.MaskedArray):
350 import numpy.ma.mrecords as mrecords
d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self, data, index, columns, dtype)
457 arrays = [data[k] for k in keys]
458
--> 459 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
460
461 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
7313 # figure out the index, if necessary
7314 if index is None:
-> 7315 index = extract_index(arrays)
7316
7317 # don't force copy because getting jammed in an ndarray anyway
d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in extract_index(data)
7359 lengths = list(set(raw_lengths))
7360 if len(lengths) > 1:
-> 7361 raise ValueError('arrays must all be same length')
7362
7363 if have_dicts:
ValueError: arrays must all be same length
The core problem here is your data structure design: it makes it easy to fall into programming errors rather than helping to prevent them.
In this answer I'm going to use a standard programmer trick: I'm not even going to try to figure out what the problem is in the current code, but simply restructure things so that problem can no longer appear.
In a CSV file each line is a sequence of closely related items. In turn, the whole file is a sequence of these lines. You want to keep the more closely related items closer together in the data structure, so your "inside" data structure of the two lists should be a sequence of the fields in a single line, and the "outside" data structure should be a sequence of the lines, which is the opposite of what you've done.
In Python there are two very common sequence data structures: list, which you already know about and are using here, and tuple which is similar to list but immutable.
For this program it's worth learning and understanding the namedtuple data structure, which is a tuple but extended with field names and a constructor that will ensure you're always using the same number of arguments. The latter is yet another data structure design decision that will help you avoid programming errors.
Define your data structure for a CSV line as follows:
from collections import namedtuple
Download = namedtuple('Download', 'id title url path')
(It's worth typing this directly into a Python interpreter (python -i or ipython) and playing around with it a bit until you get comfortable with creating and showing named tuples.)
You can then build a list of these as you do your downloads. Since a tuple is immutable we need to build it a single call to the constructor, we can create it only after we have all the information we need to do so. Then we add it to the list.
def download(id, url):
# All the stuff you need to do an individual download here.
return path
downloads = []
for s in submissions:
path = download(s.id, s.url)
dl = Download.new(s.id, s.title, s.url, path)
downloads.append(dl)
You don't need to install Pandas to write CSV files; there's a csv module in the standard library that does a fine job. Working from an example in its documentation:
import csv
with open(os.path.join(path,'xscrape.csv'), 'w', newline='') as out:
writer = csv.writer(out)
writer.writerows(downloads)
(This produces a CSV file without a header line; adding one I leave as an exercise for the reader.)
Related
Good day.
I wrote a little Python program to help me easily create .cbc files for Calibre, which is just a renamed .zip file with a text file called comics.txt for TOC purposes. Each chapter is another zip file.
The issue is that the last zip file zipped always has the error "Unexpected end of data". The file itself is not corrupt, if I unzip it and rezip it it works perfectly. Playing around it seems that the problem is that Python doesn't close the last zip file after zipping it, since I can't delete the last zip while the program is still running since it's still open in Python. Needless to say, Calibre doesn't like the file and fails to convert it unless I manually rezip the affected chapters.
The code is as follows, checking the folders for not-image files, zipping the folders, zipping the zips while creating the text file, and "changing" extension.
import re, glob, os, zipfile, shutil, pathlib, gzip, itertools
Folders = glob.glob("*/")
items = len(Folders)
cn_list = []
cn_list_filtered = []
dirs_filtered = []
ch_id = ["c", "Ch. "]
subdir_im = []
total = 0
Dirs = next(os.walk('.'))[1]
for i in range(0, len(Dirs)):
for items in os.listdir("./" + Dirs[i]):
if items.__contains__('.png') or items.__contains__('.jpg'):
total+=1
else:
print(items + " not an accepted format.")
subdir_im.append(total)
total = 0
for fname in Folders:
if re.search(ch_id[0] + r'\d+' + r'[\S]' + r'\d+', fname):
cn = re.findall(ch_id[0] + "(\d+[\S]\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[0] + r'\d+', fname):
cn = re.findall(ch_id[0] + "(\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[1] + r'\d+' + '[\S]' + r'\d+', fname):
cn = re.findall(ch_id[1] + "(\d+[\S]\d+)", fname)[0]
cn_list.append(cn)
elif re.search(ch_id[1] + r'\d+', fname):
cn = re.findall(ch_id[1] + "(\d+)", fname)[0]
cn_list.append(cn)
else:
print('Warning: File found without proper filename format.')
cn_list_filtered = set(cn_list)
cn_list_filtered = sorted(cn_list_filtered)
cwd = os.getcwd()
Dirs = Folders
subdir_zi = []
total = 0
for i in range(0, len(cn_list_filtered)):
for folders in Dirs:
if folders.__contains__(ch_id[0] + cn_list_filtered[i] + " ")\
or folders.__contains__(ch_id[1] + cn_list_filtered[i] + " "):
print('Zipping folder ', folders)
namezip = "Chapter " + cn_list_filtered[i] + ".zip"
current_zip = zipfile.ZipFile(namezip, "a")
for items in os.listdir(folders):
if items.__contains__('.png') or items.__contains__('.jpg'):
current_zip.write(folders + "/" + items, items)
total+=1
subdir_zi.append(total)
total = 0
print('Folder contents in order:', subdir_im, ' Total:', sum(subdir_im))
print("Number of items per zip: ", subdir_zi, ' Total:', sum(subdir_zi))
if subdir_im == subdir_zi:
print("All items in folders have been successfully zipped")
else:
print("Warning: File count in folders and zips do not match. Please check the affected chapters")
zips = glob.glob("*.zip")
namezip2 = os.path.basename(os.getcwd()) + ".zip"
zipfinal = zipfile.ZipFile(namezip2, "a")
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
Data = []
for i in range (0,len(cn_list_filtered),1):
Datai = ("Chapter " + cn_list_filtered[i] + ".zip" + ":Chapter " + cn_list_filtered[i] + "\r\n")
Data.append(Datai)
Dataok = ''.join(Data)
with zipfile.ZipFile(namezip2, 'a') as myzip:
myzip.writestr("comics.txt", Dataok)
zipfinal.close()
os.rename(namezip2, namezip2 + ".cbc")
os.system("pause")
I am by no means a programmer, that is just a Frankenstein monster code I eventually managed to put together by checking threads, but this last issue has me stumped.
Some solutions I tried are:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[i].close()
Fails with:
zips[i].close()
AttributeError: 'str' object has no attribute 'close'
and:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[len(zips)].close()
Fails with:
zips[len(zips)].close()
IndexError: list index out of range
Thanks for the help.
This solved my issue:
def generate_zip(file_list, file_name=None):
zip_buffer = io.BytesIO()
zf = zipfile.ZipFile(zip_buffer, mode="w", compression=zipfile.ZIP_DEFLATED)
for file in file_list:
print(f"Filename: {file[0]}\nData: {file[1]}")
zf.writestr(file[0], file[1])
**zf.close()**
with open(file_name, 'wb') as f:
f.write(zip_buffer.getvalue())
f.close()
I have more than 500 xml files and each xml file should processed on FME workbench individually (iteration of FME workbench for each xml file).
For such a propose i have to run a python file (loop.py) to iterate FME workbench for each xml file.
The whole process was working in past on other PC without any problem. Now Once i run Module i got the following error:
Traceback (most recent call last):E:\XML_Data
File "E:\XML_Data\process\01_XML_Tile_1.py", line 28, in
if "Translation was SUCCESSFUL" in open(path_log + "\" + data + ".log").read():
IOError: [Errno 2] No such file or directory: 'E:\XML_Data\data_out\log_01\re_3385-5275.xml.log'
Attached the python code(loop.py).
Any help is greatly appreciated.
import os
import time
# Mainpath and Working Folder:
#path_main = r"E:\XML_Data"
path_main = r"E:\XML_Data"
teil = str("01")
# variables
path_in = path_main + r"\data_in\03_Places\teil_" + teil # "Source folder of XML files"
path_in_tile10 = path_main + r"\data_in\01_Tiling\10x10.shp" # "Source folder of Grid shapefile"
path_in_commu = path_main + r"\data_in\02_Communities\Communities.shp" # "Source folder of Communities shapefile"
path_out = path_main + r"\data_out\teil_" + teil # "Output folder of shapefiles that resulted from XML files (tile_01 folder)"
path_log = path_main + r"\data_out\log_" + teil # "Output folder of log files for each run(log_01 folder)"
path_fme = r"%FME_EXE_2015%" # "C:\Program Files\FME2015\fme.exe"
path_fme_workbench = path_main + r"\process\PY_FME2015.fmw" # "path of FME workbench"
datalists = os.listdir(path_in)
count = 0
# loop each file individually in FME
for data in datalists:
if data.find(".xml") != -1:
count +=1
print ("Run-No." + str(count) + ": with data " + data)
os.system (path_fme + " " + path_fme_workbench + " " + "--SourceDataset_XML"+ " " + path_in + "\\" + data + " " + "--SourceDataset_SHAPE" + " " + path_in_tile10 + " " + "--SourceDataset_SHAPE_COMU" + " " + path_in_commu + " " + "--DestDataset_SHAPE" +" " +path_out + " " +"LOG_FILENAME" + " " + path_log + "\\" + data + ".log" )
print ("Data processed: " + data)
shape = str(data[19:28]) + "_POPINT_CENTR_UTM32N.shp"
print ("ResultsFileName: " + shape)
if "Translation was SUCCESSFUL" in open(path_log + "\\" + data + ".log").read():
# Translation was successful and SHP file exists:
if os.path.isfile(path_out + "\\" + shape):
write_log = open(path_out + "\\" + "result_xml.log", "a")
write_log.write(time.asctime(time.localtime()) + " " + shape + "\n")
write_log.close()
print("Everything ok")
#Translation was successful, but SHP file does not exist:
else:
write_log = open(path_out + "\\" + "error_xml.log", "a")
write_log.write(time.asctime(time.localtime()) + " Data: " + shape + " unavailable.\n")
write_log.close()
# Translation was not successful:
else:
write_log = open(path_out + "\\" + "error_xml.log", "a")
write_log.write(time.asctime(time.localtime()) + " Translation " + Data + " not successful.\n")
write_log.close()
print ("Number of calculated files: " + str(count))
Most likely, the script failed at the os.system line, so the log file was not created from the command. Since you mentioned a different computer, it could be caused by many reasons, such as a different version of FME (so the environment variable %FME_EXE_2015% would not exist).
Use a workspace runner transformer to do this.
The FME version is outdated.so first check the version whether it is creating the problem.
subprocess.call(["C:/Program Files/fme/FMEStarter/FMEStarter.exe", "C:/Program Files/fme/fme20238/fme.exe", "/fmefile.fmw" "LOG_FILENAME","logfile"], stdin=None, stdout=None, stderr=None, shell=True, timeout=None)
I'm assuming this has to be a memory issue but I'm not sure. The program loops through PDF's to look for corrupted files. When a file is corrupted, it writes that location to a txt file for me to review later. When running it the first time, I logged both pass and fail scenarios to the log. After 67381 log entries, it stopped. Then I changed this logic so it only logs errors, however, in the console I did display a count of the loop so I can tell how far along the process is. There are about 190k files to loop through and at exactly 67381 the count stops every time. It looks like the python program is still running in the background as the memory and cpu keeps fluctuating but it's hard to be sure. I also don't know now if it will still write errors to the log.
Here is the code,
import PyPDF2, os
from time import gmtime,strftime
path = raw_input("Enter folder path of PDF files:")
t = open(r'c:\pdf_check\log.txt','w')
count = 1
for dirpath,dnames,fnames in os.walk(path):
for file in fnames:
print count
count = count + 1
if file.endswith(".pdf"):
file = os.path.join(dirpath, file)
try:
PyPDF2.PdfFileReader(open(file, "rb"))
except PyPDF2.utils.PdfReadError:
curdate = strftime("%Y-%m-%d %H:%M:%S", gmtime())
t.write (str(curdate) + " " + "-" + " " + file + " " + "-" + " " + "fail" + "\n")
else:
pass
#curdate = strftime("%Y-%m-%d %H:%M:%S", gmtime())
#t.write(str(curdate) + " " + "-" + " " + file + " " + "-" + " " + "pass" + "\n")
t.close()
Edit 1: (New Code)
New code and the same issue:
import PyPDF2, os
from time import gmtime,strftime
path = raw_input("Enter folder path of PDF files:")
t = open(r'c:\pdf_check\log.txt','w')
count = 1
for dirpath,dnames,fnames in os.walk(path):
for file in fnames:
print count
count = count + 1
if file.endswith(".pdf"):
file = os.path.join(dirpath, file)
try:
with open(file,'rb') as f:
PyPDF2.PdfFileReader(f)
except PyPDF2.utils.PdfReadError:
curdate = strftime("%Y-%m-%d %H:%M:%S", gmtime())
t.write (str(curdate) + " " + "-" + " " + file + " " + "-" + " " + "fail" + "\n")
f.close()
else:
pass
f.close()
#curdate = strftime("%Y-%m-%d %H:%M:%S", gmtime())
#t.write(str(curdate) + " " + "-" + " " + file + " " + "-" + " " + "pass" + "\n")
t.close()
Edit 2: I am trying to now run this from a different machine with beefier hardware and a different version of windows (10 pro instead of server 2008 r2) but I don't think this is the issue.
Try to edit one of the .pdf files to make it larger. That way, if the loop number your program "stops" at is smaller, you can identify the problem as a memory issue.
Else, it might be an unusually large pdf file that is taking your program a while to verify integrity.
Debugging this, you could print the file location of the .pdf files you open to find this particular .pdf and manually open it to investigate further..
Figured it out. The issue is actually due to a random and very large corrupted PDF. So this is not a loop issue, it's a corrupted file issue.
I am trying to parse multiple files dealing with "Mike's Pies" as you can see in the code below. I have written it to where I get the desired output, now I would like to parse all the files named "Mike's Pies"
import json
import sys
import glob
with open("Mike's Pies.20130201.json") as json_data:
data = json.load(json_data)
#Keep all orders with variable of r
for r in data ["orders"]:
orderName = r["orderPlacer"]["name"]
#Print with address to acquire the housenumber/street/city/state
address = r["address"]["houseNumber"]
street = r["address"]["street"]
city = r["address"]["city"]
state = r["address"]["state"]
Mikes = "Mike's Pies,"
output = str(orderName) + ", " + str(address) + " " + str(street) +
" " + str(city) + " " + str(state) + ", " + Mikes + " "
length = len(r["pizzas"])
for i in range (0,length):
#if length >= 1 print r["pizzas"][1]["name"]
#if i!=length:
pizza = ((r["pizzas"][i]["name"].strip("\n"))).strip(" ")
if(i!=length-1):
output += pizza + ", "
else:
output += pizza
print(output+"\n")
It sounds like you have code which works on "Mike's Pies.20130201.json", and you want to run that code on every file that starts with "Mike's Pies" and ends with "json", regardless of the timestamp-like bit in the middle. Am I right? You can get all matching filenames with glob and parse them one after the other.
for filename in glob.glob("Mike's Pies.*.json"):
with open(filename) as json_data:
data = json.load(json_data)
#etc etc... Insert rest of code here
I have many subdirectories in my main directory and would like to write a script to unzip and convert all the files within it. If possible, I would also like to combine all the CSV within a single directory into a single CSV. But more importantly, I need help with my nested loop.
import gzip
import csv
import os
subdirlist = os.listdir('/home/user/Desktop/testloop')
subtotal = len(subdirlist)
subcounter = 0
for dirlist in subdirlist:
print "Working On " + dirlist
total = len(dirlist)
counter = 0
for dir in dirlist:
print "Working On " + dir
f = gzip.open('/' + str(subdirlist) + '/' + dir, 'rb')
file_content = f.read()
f.close()
print "25% Complete"
filename = '/' + str(subdirlist) + '/temp.txt'
target = open(filename, 'w')
target.write(file_content)
target.close()
print "50% Complete!"
csv_file = '/' + str(subdirlist) + '/' + str(dir) + '.csv'
in_txt = csv.reader(open(filename, "rb"), delimiter = '\t')
out_csv = csv.writer(open(csv_file, 'wb'))
out_csv.writerows(in_txt)
os.remove(filename)
os.remove('/' + str(subdirlist) + '/' + dir)
counter+=1
print str(counter) + "/" + str(total) + " " + str(dir) + " Complete!"
print "SubDirectory Converted!"
print str(subcounter) + "/" + str(subtotal) + " " + str(subdirlist) + " Complete!"
subcounter+=1
print "All Files Converted!"
Thanks in advance
To get lists of files and subdirectories, you can use os.walk. Below is an implementation I wrote to get all files (optionally, of certain type(s)) in arbitrarily nested subdirectories:
from os import walk, sep
from functools import reduce # in Python 3.x only
def get_filelist(root, extensions=None):
"""Return a list of files (path and name) within a supplied root directory.
To filter by extension(s), provide a list of strings, e.g.
get_filelist(root, ["zip", "csv"])
"""
return reduce(lambda x, y: x+y,
[[sep.join([item[0], name]) for name in item[2]
if (extensions is None or
name.split(".")[-1] in extensions)]
for item in walk(root)])