getting weird results from metapub and pubmed - python

I am using the code below to get any free journal PDFs from PubMed. It does download something, but when I look at it, it just consists of the number 1. Any ideas on where I am going wrong? Thank you.
import metapub
from urllib.request import urlretrieve
import textract
from pathlib import Path

another_path = '/content/Articles/'

pmid_list = ['35566889', '33538053', '30848212']

for i in range(len(pmid_list)):
    query = pmid_list[i]
    #for ind in pmid_df.index:
    #    query = pmid_df['PMID'][ind]
    url = metapub.FindIt(query).url
    try:
        urlretrieve(url)
        file_name = query
        out_file = another_path + file_name
        with open(out_file, "w") as textfile:
            textfile.write(textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8"))
    except:
        continue

I see two mistakes.
First: urlretrieve(url) saves the data in a temporary file with a random filename, so you can't access it because you don't know that filename. You should use the second parameter to save it under your own filename:
urlretrieve(url, file_name)
Second: you use the same out_file both to process the file (process(out_file)) and to write the result (open(out_file, 'w')). But open() in write mode truncates the file first, so you end up processing an empty file. You should process the file first and open it for writing afterwards:
data = textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8")

with open(out_file, "wb") as textfile:  # save bytes
    textfile.write(data)
Or write the result under a different name (e.g. with the extension .txt).
Full working example with other small changes:
import os
from urllib.request import urlretrieve

import metapub
import textract

#another_path = '/content/Articles/'
another_path = './'

pmid_list = ['35566889', '33538053', '30848212']

for query in pmid_list:
    print('query:', query)
    url = metapub.FindIt(query).url
    print('url:', url)
    if url:
        try:
            out_file = os.path.join(another_path, query)
            print('out_file:', out_file)

            print('... downloading')
            urlretrieve(url, out_file + '.pdf')

            print('... processing')
            data = textract.process(out_file + '.pdf', extension='pdf', method='pdftotext', encoding="utf_8")

            print('... saving')
            with open(out_file + '.txt', "wb") as textfile:  # save bytes
                textfile.write(data)

            print('... OK')
        except Exception as ex:
            print('Exception:', ex)
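As an aside, when FindIt can't locate a free PDF it returns url as None, and it can usually tell you why. A minimal sketch, assuming the FindIt object exposes a reason attribute as metapub's documentation describes:

import metapub

query = '35566889'  # one of the PMIDs from the question
src = metapub.FindIt(query)
if src.url:
    print('url:', src.url)
else:
    # reason is a short explanation (e.g. paywalled) -- assumed attribute per metapub docs
    print('no free full text for', query, '-', src.reason)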


Python Progress Bar in nested loop

I'm translating some Linux log data to a CSV for data analytics. Some of the instructions take some time, so I thought I would put in a progress bar for each file that is being translated. However, when I add a progress bar with either progressbar2 or tqdm, my pandas dataframes are null. There's no data at all. When I remove the progress bar, everything works as it should.
Here is my CSV translating function:
import pandas as pd
from dateutil import parser
from tqdm import trange
import os
import glob
import csv
import socket

def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        file_length = len(data_file.readlines())
        for i in trange(file_length, desc='loop', leave=False):  # Progress Bar Via TQDM
            for line in data_file:
                new_line = line.strip().split(" ")
                date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
                date = parser.parse(date)
                ip = str(new_line[5]).partition("/")
                ip = str(ip[0]).strip()
                try:
                    url = str(new_line[7]).strip()
                except:
                    url = None
                csv_w.writerow([date, ip, url])
TQDM is breaking something or I am implementing it incorrectly.
EDIT 1:
I figured it out. I was exhausting the file handle with my readlines() call when getting the length. This works:
def logsToCSV():
    print("[+] Translating log to CSV")
    log_file = open(CSV_FILE_PATH, "w", newline='')
    csv_w = csv.writer(log_file)
    path, dirs, files = next(os.walk(LOGS_FILE_PATH))
    log_num = len(files)
    print(log_num)
    for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')):  # Find all files in path with .txt
        data_file = open(filename, "r")
        with open(filename, "r") as f:
            file_length = len(f.readlines())
        pbar = tqdm(total=file_length)
        for line in data_file:
            new_line = line.strip().split(" ")
            date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
            date = parser.parse(date)
            ip = str(new_line[5]).partition("/")
            ip = str(ip[0]).strip()
            try:
                url = str(new_line[7]).strip()
            except:
                url = None
            csv_w.writerow([date, ip, url])
            pbar.update(1)
        pbar.close()
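As an aside, you can avoid opening the file twice by counting the lines and then rewinding the same handle with seek(0). A minimal sketch, with the per-line work elided to a print:

from tqdm import tqdm

def count_and_iterate(filename):
    with open(filename, "r") as data_file:
        file_length = sum(1 for _ in data_file)  # counting consumes the handle...
        data_file.seek(0)                        # ...so rewind before iterating again
        for line in tqdm(data_file, total=file_length):
            print(line.strip())                  # placeholder for the real per-line work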
You can apply tqdm to your main loop (note the spelling, tqdm):
from tqdm import tqdm

for item in tqdm(iterable):
    ...

Downloading image from url and assign it with a specified id from csv file

I have a CSV file with columns: image_id, image_url.
I need to download all the images from the URLs and save each one with its corresponding image_id as the name. Is there a way to do so?
I'm aware you can do so with Python from code that I've seen online, such as:
import cStringIO  # *much* faster than StringIO
import urllib
import Image

try:
    file = urllib.urlopen('http://freegee.sourceforge.net/FG_EN/src/teasers_en/t_gee-power_en.gif')
    im = cStringIO.StringIO(file.read())  # constructs a StringIO holding the image
    img = Image.open(im)
    img.save('/home/wenbert/uploaderx_files/test.gif')
except IOError, e:
    raise e
But how do you reference the URL and the filename from the CSV?
Even better if I can automate the process of uploading to a GCP bucket.
Appreciate any help I can get.
Cheers!
This should help. Use the csv module to parse through your CSV file.
Ex:
# -*- coding: utf-8 -*-
import csv
import cStringIO  # *much* faster than StringIO
import urllib
import Image

def downloadFile(imageID, url):
    try:
        file = urllib.urlopen(url)
        im = cStringIO.StringIO(file.read())  # constructs a StringIO holding the image
        img = Image.open(im)
        img.save('/home/wenbert/uploaderx_files/{0}.gif'.format(imageID))
    except IOError, e:
        raise e

with open('PATH_TO_.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # skip the headers
    for row in reader:
        print row
        downloadFile(row[0], row[1])
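Since cStringIO, urllib.urlopen and the standalone Image module are Python 2 only, here is a sketch of the same idea for Python 3, assuming Pillow is installed and a two-column images.csv as described in the question:

import csv
import io
import urllib.request

from PIL import Image  # Pillow

def download_file(image_id, url):
    with urllib.request.urlopen(url) as resp:
        img = Image.open(io.BytesIO(resp.read()))
        img.save('{0}.gif'.format(image_id))  # mirrors the .gif naming above

with open('images.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the headers
    for row in reader:
        download_file(row[0], row[1])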
I have produced a Python script below. I have only tested this in Python 3.4.3, but it should do the trick.
Hope this helps.
import csv
import os
import urllib.request

import requests

spreadsheetAddress = 'C:\\SOURCE\\CSV\\FILE.csv'
targetDirectory = 'C:\\TARGET\\IMAGE\\SAVE\\LOCATION\\'

def getSpreadsheetContents(spreadsheetAddress):
    with open(spreadsheetAddress) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        imageSet = {}
        for row in readCSV:
            if 'image_id' not in row:  # skip the header row
                imageSet[row[0]] = row[1]
        return imageSet

if __name__ == "__main__":
    if os.path.exists(spreadsheetAddress) and os.path.exists(targetDirectory):
        imageDict = getSpreadsheetContents(spreadsheetAddress)
        for key, value in imageDict.items():
            if requests.get(value).status_code == 200:
                filename, file_extension = os.path.splitext(value)
                address = os.path.join(targetDirectory, key + file_extension)
                urllib.request.urlretrieve(value, address)
    else:
        raise Exception("File not found")
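For the GCP part of the question: a minimal sketch of pushing each downloaded file to a bucket with the google-cloud-storage client library. The bucket name and credentials setup here are assumptions; see the library's documentation for authentication options:

from google.cloud import storage

def upload_to_bucket(local_path, blob_name, bucket_name='my-image-bucket'):
    # assumes GOOGLE_APPLICATION_CREDENTIALS is set for authentication
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(local_path)

# e.g. after urlretrieve(value, address):
# upload_to_bucket(address, key + file_extension)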

NVD - JSON to CSV with Python

I am trying to download the NVD CVE data. Here is my Python code:
import requests
import re

r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')
for filename in re.findall("nvdcve-1.0-[0-9]*\.json\.zip", r.text):
    print(filename)
    r_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)
    with open("nvd/" + filename, 'wb') as f:
        for chunk in r_file:
            f.write(chunk)
Now I want to write all the JSON files into a CSV file with this format:
Name, Value, Description, ..., ...
Name, Value, Description, ..., ...
Can somebody help me?
The following should get you started, giving you the columns ID, URL, VendorName, Description and VersionValues:
import requests
import re
import zipfile
import io
import json
import csv

r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')

with open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['ID', 'URL', 'VendorName', 'Description', 'VersionValues'])

    for filename in re.findall("nvdcve-1.0-[0-9]*\.json\.zip", r.text):
        print("Downloading {}".format(filename))
        r_zip_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)
        zip_file_bytes = io.BytesIO()
        for chunk in r_zip_file:
            zip_file_bytes.write(chunk)
        zip_file = zipfile.ZipFile(zip_file_bytes)

        for json_filename in zip_file.namelist():
            print("Extracting {}".format(json_filename))
            json_raw = zip_file.read(json_filename).decode('utf-8')
            json_data = json.loads(json_raw)

            for entry in json_data['CVE_Items']:
                try:
                    vendor_name = entry['cve']['affects']['vendor']['vendor_data'][0]['vendor_name']
                except IndexError:
                    vendor_name = "unknown"

                try:
                    url = entry['cve']['references']['reference_data'][0]['url']
                except IndexError:
                    url = ''

                try:
                    vv = []
                    for pd in entry['cve']['affects']['vendor']['vendor_data'][0]['product']['product_data']:
                        for vd in pd['version']['version_data']:
                            vv.append(vd['version_value'])
                    version_values = '/'.join(vv)
                except IndexError:
                    version_values = ''

                csv_output.writerow([
                    entry['cve']['CVE_data_meta']['ID'],
                    url,
                    vendor_name,
                    entry['cve']['description']['description_data'][0]['value'],
                    version_values])
This downloads each zip file into memory. It then extracts each contained file into memory, one at a time, and converts the JSON into a Python data structure using json.loads(). For each entry in CVE_Items it then extracts a handful of the fields and writes them to a CSV file.
As the JSON data is highly structured, you will need to consider how you would want to represent all of the fields in a CSV file. Currently it extracts a few "useful" fields and stores those.
Alternatively, instead of building your own CSV you could work with pandas (removing the csv_output lines):
df = pd.read_json(json_raw)
df.to_csv(f_output)
This would need some extra work, though, to decide how the output should be formatted.
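If you go the pandas route, pandas.json_normalize (available in recent pandas versions) may be closer to what you want for the nested CVE_Items structure. A sketch, assuming one of the feed files has already been saved to disk (the filename here is illustrative):

import json
import pandas as pd

with open('nvdcve-1.0-2017.json') as f:
    json_data = json.load(f)

# flattens nested keys into dotted column names, e.g. cve.CVE_data_meta.ID
df = pd.json_normalize(json_data['CVE_Items'])
df.to_csv('output.csv', index=False)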

How to save data from python into a csv file

I've got a program that at the end prints a "match". I wanted to save the data in this "match" to a CSV file. How can I do that? I've written some code to save this variable, but it doesn't write anything.
Here's my code:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is ' + saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
    for file_name in file_name_list:
        if not file_name.endswith('.html'):
            continue
        with open(file_name) as markup:
            soup = BeautifulSoup(markup.read())
            text = soup.get_text()
            match = re.findall("PA/(\S*)\s*(\S*)", text)
            print(match)

with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows('%s' % match)
The part where I tried to save it into a csv file is the last 3 lines of code.
Here's a print of the "match" format: https://gyazo.com/930f9dad12109bc50825c91b51fb31f3
The way your code is structured, you iterate over the matches in your for loop; then, when the loop is finished, you save only the last match to your CSV. You probably want to write each match to the CSV instead, inside the for loop.
Try replacing the last lines of your code (starting at the last for loop) with:
with open('score.csv', 'wt') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)\s*(\S*)", text)
                print(match)
                writer.writerow(match)
Assuming you already have your "match", you can use the CSV module in Python. The writer should get your job done.
It would be more helpful if you could elaborate on the format of your data.
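To illustrate the csv module suggestion: re.findall with two capture groups, as in the question, returns a list of 2-tuples, which maps directly onto writerows. A minimal sketch with made-up sample data:

import csv

# hypothetical sample of what re.findall("PA/(\S*)\s*(\S*)", text) returns
match = [('12345', 'SMITH'), ('67890', 'JONES')]

with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(match)  # one CSV row per tuple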

renaming a list of pdf files with for loop

I am trying to rename a list of PDF files by extracting the name from the file using pyPdf. I tried to use a for loop to rename the files, but I always get an error with code 32 saying that the file is being used by another process. I am using Python 2.7.
Here's my code:
import os, glob
from pyPdf import PdfFileWriter, PdfFileReader

# this function extracts the name of the file
def getName(filepath):
    output = PdfFileWriter()
    input = PdfFileReader(file(filepath, "rb"))
    output.addPage(input.getPage(0))
    outputStream = file(filepath + '.txt', 'w')
    output.write(outputStream)
    outputStream.close()
    outText = open(filepath + '.txt', 'rb')
    textString = outText.read()
    outText.close()
    nameStart = textString.find('default">')
    nameEnd = textString.find('_SATB', nameStart)
    nameEnd2 = textString.find('</rdf:li>', nameStart)
    if nameStart:
        testName = textString[nameStart+9:nameEnd]
        if len(testName) <= 100:
            name = testName + '.pdf'
        else:
            name = textString[nameStart+9:nameEnd2] + '.pdf'
    return name

pdfFiles = glob.glob('*.pdf')
m = len(pdfFiles)

for each in pdfFiles:
    newName = getName(each)
    os.rename(each, newName)
Consider using Python's with statement. With it you do not need to handle closing the file yourself:
def getName(filepath):
    output = PdfFileWriter()
    with file(filepath, "rb") as pdfFile:
        input = PdfFileReader(pdfFile)
        ...
You're not closing the input stream (the file) used by the pdf reader.
Thus, when you try to rename the file, it's still open.
So, instead of this:
input = PdfFileReader(file(filepath, "rb"))
Try this:
inputStream = file(filepath, "rb")
input = PdfFileReader(inputStream)
# ... when done with this file ...
inputStream.close()
It does not look like you close the file object associated with the PDF reader. It may be closed automatically at the end of the function, but to be sure you might want to create a separate file object that you pass to PdfFileReader, close the handle when done, and then rename.
The below is from SO: How to close pyPDF "PdfFileReader" class file handle.
import os.path
from pyPdf import PdfFileReader
fname = 'my.pdf'
fh = file(fname, "rb")
input = PdfFileReader(fh)
fh.close()
os.rename(fname, 'my_renamed.pdf')
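For what it's worth, the same pattern carries over to pyPdf's successor PyPDF2: open the handle yourself and make sure it is closed before renaming. A sketch, assuming the PyPDF2 2.x API is installed:

import os
from PyPDF2 import PdfReader

fname = 'my.pdf'
with open(fname, 'rb') as fh:  # handle is closed when the block exits
    reader = PdfReader(fh)
    first_page_text = reader.pages[0].extract_text()

os.rename(fname, 'my_renamed.pdf')  # safe: no open handle remains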
