I have the following Python script that reads the image URLs from a text file, then downloads the images and saves them in the same folder. The images are downloaded fine, but for some reason:
# this script is used to download the images using the provided url
import requests
import ntpath
# save image data
def save_image_data(image_data, file_name):
    """Write raw image bytes to *file_name*, overwriting any existing file.

    image_data -- bytes object (e.g. response.content from requests)
    file_name  -- destination path
    """
    # binary mode: image_data is bytes, not text
    with open(file_name, 'wb') as file_object:
        file_object.write(image_data)
# read the images_url file
# read the images_url file and download each image
with open('images_urls_small.txt') as file_object:
    for line in file_object:
        # strip the trailing newline; otherwise it ends up in both the
        # request URL and the saved file name (the stray '?' suffix)
        url = line.strip()
        if not url:
            continue  # skip blank lines
        file_name = ntpath.basename(url)
        print(file_name)
        # download the image
        try:
            image_data = requests.get(url).content
        except requests.RequestException:
            # narrow except: only network/HTTP errors; and skip the save,
            # since image_data would be stale or undefined here
            print("error download an image")
            continue
        # save the image
        save_image_data(image_data, file_name)
The images are downloaded fine, but for some reason they end up with a ? after their file name, as shown in the screenshot below.
What am I missing?
You are taking the filenames from a file:
for line in file_object:
file_name = ntpath.basename(line)
but those lines will still have the line separator (a newline character, so \n) included. Strip your lines:
for line in file_object:
file_name = ntpath.basename(line.strip())
Related
I have the following code (comments explain what is occurring):
import os
from io import StringIO
from PyPDF2 import PdfFileReader
# Path to the directory containing the PDF files
pdf_dir = '/path/to/pdf/files'
# Iterate over the files in the directory
for filename in os.listdir(pdf_dir):
    # Check if the file is a PDF file
    if filename.endswith('.pdf'):
        # Construct the full path to the file
        filepath = os.path.join(pdf_dir, filename)
        # Open the PDF file and read its contents
        with open(filepath, 'rb') as f:
            pdf = PdfFileReader(f)
            # Extract the text from every page; ''.join avoids the
            # quadratic cost of repeated string concatenation
            text = ''.join(page.extractText() for page in pdf.pages)
        # Construct the name of the output text file
        txt_filename = filename[:-4] + '.txt'
        # Write the text next to the source PDF (the original wrote to
        # the current working directory, scattering output files)
        with open(os.path.join(pdf_dir, txt_filename), 'w') as out:
            out.write(text)
When I run the code, it produces a Xref table not zero-indexed. ID numbers for objects will be corrected warning. It is not a hard error, but it makes me wonder if there's a different way I should be doing this.
Thanks for any suggestions.
I want to extract text from multiple text files and the idea is that i have a folder and all text files are there in that folder.
I have tried and successfully got the text, but when I use that string buffer somewhere else, only the first text file's text is visible to me.
I want to store these texts to a particular string buffer.
what i have done:
import glob
import io
# Accumulator intended to hold the text of every file (starts as one space)
Raw_txt = " "
# Collect all .txt files under the RAW folder
files = [file for file in glob.glob(r'C:\\Users\\Hp\\Desktop\\RAW\\*.txt')]
for file_name in files:
with io.open(file_name, 'r') as image_file:
content1 = image_file.read()
# NOTE(review): plain assignment replaces the previous contents on every
# iteration, so after the loop only the LAST file's text survives —
# this is the bug described above; append with += instead.
Raw_txt = content1
print(Raw_txt)
This Raw_txt buffer only works in this loop but i want this buffer somewhere else.
Thanks!
I think the issue is related to where you load the content of your text files.
Raw_txt is overwritten with each file.
I would recommend you to do something like this where the text is appended:
import glob
# Accumulates the contents of every matching text file
Raw_txt = ""
# glob.glob already returns a list — no wrapping comprehension needed
files = glob.glob(r'C:\\Users\\Hp\\Desktop\\RAW\\*.txt')
for file_name in files:
    # 'fh' avoids shadowing the builtin name 'file'
    with open(file_name, "r+") as fh:
        # append (+=) so each file's text is kept; "\n" separates the
        # content of the different files
        Raw_txt += fh.read() + "\n"
print(Raw_txt)
Also in order to read a text file you don't need io module.
I'm trying to create a download function for my streamlit app. But what I currently have allows me to download a zip file via a button on my streamlit app but unfortunately it also saves it to my local folder. I don't want it to save to my local folder. The problem is when I initialize the file_zip object. I want the zip file in a specific name ideally the same name of the file that the user upload with a '.zip' extension (i.e datafile that contains the string file name as a parameter in the function). But everytime I do that it keeps saving the zip file in my local folder. Is there an alternative to this? BTW I'm trying to save list of pandas dataframe into one zip file.
# Builds a zip of CSVs from list_df and offers it for download in Streamlit.
# NOTE(review): the zip is created ON DISK (this is the unwanted "saves to my
# local folder" behavior) and then read back for base64 encoding.
def downloader(list_df, datafile, file_type):
# base name of the uploaded file, without its extension
file = datafile.name.split(".")[0]
#create zip file
with zipfile.ZipFile("{}.zip".format(file), 'w', zipfile.ZIP_DEFLATED) as file_zip:
for i in range(len(list_df)):
file_zip.writestr(file+"_group_{}".format(i)+".csv", pd.DataFrame(list_df[i]).to_csv())
# NOTE(review): redundant — the `with` statement already closes the archive
file_zip.close()
#pass it to front end for download
zip_name = "{}.zip".format(file)
with open(zip_name, "rb") as f:
# NOTE(review): `bytes` shadows the builtin of the same name
bytes=f.read()
b64 = base64.b64encode(bytes).decode()
# NOTE(review): the surrounding HTML anchor markup appears to have been
# lost when this post was scraped — presumably an <a href=...> data URL
href = f'Click Here To Download'
st.markdown(href, unsafe_allow_html=True)
It sounds like you want to create the zip file in memory and use it later to build a base64 encoding. You can use an io.BytesIO() object with ZipFile, rewind it, and read the data back for base64 encoding.
import io
def downloader(list_df, datafile, file_type):
    """Offer *list_df* (a list of DataFrame-like objects) as one zip download.

    The archive is assembled entirely in an in-memory BytesIO buffer, so
    nothing is written to the local folder.  Each element of list_df becomes
    one CSV entry named <upload-base>_group_<i>.csv.
    """
    # base name of the uploaded file, without its extension
    file = datafile.name.split(".")[0]
    # create zip file in memory
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, 'w', zipfile.ZIP_DEFLATED) as file_zip:
        for i, df in enumerate(list_df):
            file_zip.writestr(file + "_group_{}".format(i) + ".csv",
                              pd.DataFrame(df).to_csv())
    # pass it to the front end for download; getvalue() returns the whole
    # buffer, so no seek(0)+read() dance (and no dead zip_name variable)
    b64 = base64.b64encode(zip_buf.getvalue()).decode()
    href = f'Click Here To download'
    st.markdown(href, unsafe_allow_html=True)
Below is code I have written that does OCR with pytesseract.
import pyperclip, os, glob, pytesseract
from PIL import Image
# OCR the most recently created file on the Desktop and copy the result
# to the clipboard.
newest_file = max(glob.glob('/Users/<user>/Desktop/*'), key=os.path.getctime)
ocr_text = pytesseract.image_to_string(Image.open(newest_file))
pyperclip.copy(ocr_text)
Simple enough, it performs basic OCR on the image specified — the latest image that I took a screenshot of. What I was wondering is how to get the OCR'ed text into my clipboard. I have looked into the pyperclip library, and a simple pyperclip.copy should do it. I have tried simply copying it, and everywhere says that is correct. Is there something I am missing?
That should work, but if it doesn't you can try pushing it in and out of a file.
import pyperclip, os, glob, pytesseract, shutil
from PIL import Image
# OCR the newest Desktop file, round-trip the text through a file, then
# copy it to the clipboard.
all_files = glob.glob('/Users/<user>/Desktop/*')
filename = max(all_files, key=os.path.getctime)
text = pytesseract.image_to_string(Image.open(filename))
# writes text to file; mode "w" truncates AND creates the file if missing,
# unlike the original "r+" + truncate(0), which failed when it didn't exist
with open("/Users/<user>/pyOCR/string.txt", "w") as out:
    out.write(text)
# read text from file in one call — no readlines()/concatenation loop needed
with open('/Users/<user>/pyOCR/string.txt') as f:
    full_text = f.read()
# copies text
pyperclip.copy(full_text)
I am downloading multiple PDFs. I have a list of urls and the code is written to download them and also create one big pdf with them all in. The code works for the first 144 pdfs then it throws this error:
PdfReadError: EOF marker not found
I've tried making all the pdfs end in %%EOF but that doesn't work - it still reaches the same point then I get the error again.
Here's my code:
my file and converting to list for python to read each separately
# Read the url list and split on whitespace, one link per entry
with open('minutelinks.txt', 'r') as file:
    data = file.read()
links = data.split()
download pdfs
from PyPDF2 import PdfFileMerger
import requests

# Download every pdf in `links` and merge them into one big document.
urls = links
merger = PdfFileMerger()
for url in urls:
    response = requests.get(url)
    title = url.split("/")[-1]
    with open(title, 'wb') as f:
        f.write(response.content)
    # NOTE(review): append() is what parses the PDF — a truncated or
    # corrupted download raises "EOF marker not found" at this point
    merger.append(title)
merger.write("allminues.pdf")
merger.close()
I want to be able to download all of them and create one big pdf - which it appears to do until it throws this error. I have about 750 pdfs and it only gets to 144.
This is how I changed my code so it now downloads all of the pdfs and skips the one (or more) that may be corrupted. I also had to add the self argument to the function.
from PyPDF2 import PdfFileMerger
import requests
import sys
urls = links
def download_pdfs(self):
    """Download every url in the module-level *urls* list, save each PDF
    locally, and merge them all into allminues.pdf.

    PDFs that PyPDF2 cannot parse are skipped (with their title printed)
    instead of aborting the whole run.
    """
    merger = PdfFileMerger()
    for url in urls:
        response = requests.get(url)
        title = url.split("/")[-1]
        with open(title, 'wb') as f:
            f.write(response.content)
        # append() is the call that parses the PDF, so the error handling
        # belongs around it — requests.get never raises PdfReadError.
        # Skip the corrupted file and continue (the original sys.exit()
        # contradicted the stated goal of skipping bad files).
        try:
            merger.append(title)
        except PdfReadError:
            print(title)
            continue
    merger.write("allminues.pdf")
    merger.close()
The end of file marker '%%EOF' is meant to be the very last line. It is a kind of marker where the pdf parser knows, that the PDF document ends here.
My solution is to force this marker to stay at the end:
def reset_eof(self, pdf_file):
    """Rewrite *pdf_file* so it ends with a bare b'%%EOF' line, then return
    a PyPDF4 reader for it.

    Everything from the last line containing %%EOF onward is replaced by a
    single b'%%EOF' line.  If no marker exists at all, one is appended —
    the original code raised NameError (actual_line unbound) in that case.
    """
    with open(pdf_file, 'rb') as p:
        txt = p.readlines()
    # scan from the end for the last line that mentions the EOF marker
    actual_line = None
    for i, x in enumerate(txt[::-1]):
        if b'%%EOF' in x:
            actual_line = len(txt) - i - 1
            break
    if actual_line is None:
        # no marker anywhere: append one instead of crashing
        txtx = txt + [b'%%EOF']
    else:
        txtx = txt[:actual_line] + [b'%%EOF']
    with open(pdf_file, 'wb') as f:
        f.writelines(txtx)
    return PyPDF4.PdfFileReader(pdf_file)
I read that EOF is a kind of tag included in PDF files. link in portuguese
However, I guess some kinds of PDF files do not have the 'EOF marker' and PyPDF2 do not recognizes those ones.
So, what I did to fix "PdfReadError: EOF marker not found" was to open my PDF with Google Chrome and print it as .pdf once more, so that the file is converted to .pdf by Chrome and hopefully carries the EOF marker.
I ran my script with the new .pdf file converted by Chromer and it worked fine.