I am trying to download the NVD CVE feeds. Here is my Python code:
import requests
import re
r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')
for filename in re.findall("nvdcve-1.0-[0-9]*\.json\.zip", r.text):
    print(filename)
    r_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)
    with open("nvd/" + filename, 'wb') as f:
        for chunk in r_file:
            f.write(chunk)
Now I want to write all the JSON files into a CSV file with this format:
Name, Value, Description, ..., ...
Name, Value, Description, ..., ...
Can somebody help me?
The following should get you started, giving you the columns ID, URL, VendorName, Description and VersionValues:
import requests
import re
import zipfile
import io
import json
import csv

r = requests.get('https://nvd.nist.gov/vuln/data-feeds#JSON_FEED')

with open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['ID', 'URL', 'VendorName', 'Description', 'VersionValues'])

    for filename in re.findall("nvdcve-1.0-[0-9]*\.json\.zip", r.text):
        print("Downloading {}".format(filename))
        r_zip_file = requests.get("https://static.nvd.nist.gov/feeds/json/cve/1.0/" + filename, stream=True)

        zip_file_bytes = io.BytesIO()
        for chunk in r_zip_file:
            zip_file_bytes.write(chunk)

        zip_file = zipfile.ZipFile(zip_file_bytes)

        for json_filename in zip_file.namelist():
            print("Extracting {}".format(json_filename))
            json_raw = zip_file.read(json_filename).decode('utf-8')
            json_data = json.loads(json_raw)

            for entry in json_data['CVE_Items']:
                try:
                    vendor_name = entry['cve']['affects']['vendor']['vendor_data'][0]['vendor_name']
                except IndexError:
                    vendor_name = "unknown"

                try:
                    url = entry['cve']['references']['reference_data'][0]['url']
                except IndexError:
                    url = ''

                try:
                    vv = []
                    for pd in entry['cve']['affects']['vendor']['vendor_data'][0]['product']['product_data']:
                        for vd in pd['version']['version_data']:
                            vv.append(vd['version_value'])
                    version_values = '/'.join(vv)
                except IndexError:
                    version_values = ''

                csv_output.writerow([
                    entry['cve']['CVE_data_meta']['ID'],
                    url,
                    vendor_name,
                    entry['cve']['description']['description_data'][0]['value'],
                    version_values])
This downloads each zip file into memory. It then extracts each contained file into memory and converts the JSON into a Python data structure using json.loads(). For each entry in CVE_Items it then extracts a few of the fields and writes them to a CSV file.
As the JSON data is highly structured, you will need to consider how you want to represent all of the fields in a CSV file. Currently it extracts a handful of "useful" fields and stores those.
Alternatively, instead of making your own CSV, you could work with Pandas:
df = pd.read_json(json_raw)
df.to_csv(f_output)
You would remove the csv_output lines. This would, however, need some extra work to decide how the output should be formatted.
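For example, with a reasonably recent pandas you could flatten the nested CVE entries with json_normalize rather than read_json; the flattened, dot-separated column names depend on the feed schema, so treat this as a rough sketch:
import pandas as pd

# json_data is the dict produced by json.loads(json_raw) above
df = pd.json_normalize(json_data['CVE_Items'])

print(df.columns[:10])                      # inspect the flattened column names first
df.to_csv('output_pandas.csv', index=False)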
I am using the code below to get any free journal PDFs from PubMed. It does download something, but when I look at it, it just consists of the number 1. Any ideas on where I am going wrong? Thank you.
import metapub
from urllib.request import urlretrieve
import textract
from pathlib import Path
another_path='/content/Articles/'
pmid_list=['35566889','33538053', '30848212']
for i in range(len(pmid_list)):
    query = pmid_list[i]
    #for ind in pmid_df.index:
    #    query = pmid_df['PMID'][ind]
    url = metapub.FindIt(query).url
    try:
        urlretrieve(url)
        file_name = query
        out_file = another_path + file_name
        with open(out_file, "w") as textfile:
            textfile.write(textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8"))
    except:
        continue
I see two mistakes.
First: urlretrieve(url) saves the data in a temporary file with a random filename, so you can't access it because you don't know that filename. You should use the second parameter to save it under a filename of your own.
urlretrieve(url, file_name)
Second: you use the same out_file both to process the file (process(out_file)) and to write the result (open(out_file, 'w')). Opening the file in write mode first truncates it, so you end up processing an empty file. You should process the file first and only then open it for writing.
data = textract.process(out_file, extension='pdf', method='pdftotext', encoding="utf_8")
with open(out_file, "wb") as textfile: # save bytes
    textfile.write(data)
Or you could write the result under a different name (i.e. with the extension .txt).
Full working example with some other small changes:
import os
from urllib.request import urlretrieve
import metapub
import textract
#another_path = '/content/Articles/'
another_path = './'
pmid_list = ['35566889','33538053', '30848212']
for query in pmid_list:
    print('query:', query)

    url = metapub.FindIt(query).url
    print('url:', url)

    if url:
        try:
            out_file = os.path.join(another_path, query)
            print('out_file:', out_file)

            print('... downloading')
            urlretrieve(url, out_file + '.pdf')

            print('... processing')
            data = textract.process(out_file + '.pdf', extension='pdf', method='pdftotext', encoding="utf_8")

            print('... saving')
            with open(out_file + '.txt', "wb") as textfile:  # save bytes
                textfile.write(data)

            print('... OK')
        except Exception as ex:
            print('Exception:', ex)
Hello guys, hope you're doing well!
I have some CSV files that I want to put in HDFS. If a file already exists, it should append its content to the existing content. I tried a script in Python but got no results:
import os
import pandas as pd
from os import path
import sys,json
import csv
from csv import writer,reader
data = json.load(sys.stdin)
technologies = ['KPI_2G_NPO','GPRS']
old_path = data["old.path"]
filename = data["filename"]
old_path = old_path.replace("C:\\Users\\12\\Desktop\\APACHE~1\\NIFI-1~1.1\\","")
old_path = old_path.replace("/","")
old_path_list = old_path.split('\\')
def append_list_as_row(file_name, list_of_elem):
    with open(file_name, 'a+', newline='') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(list_of_elem)

df = pd.read_csv(data["new.path"] + data["filename"])
columns = df.columns.values.tolist()

for tech in technologies:
    if (tech in filename and old_path_list[0] in filename):
        if path.exists("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv"):
            header_saved = True
            with open(data["new.path"] + data["filename"]) as file2:
                header = next(file2)
                header = next(file2)
                if header_saved:
                    for line in file2:
                        append_list_as_row("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv", list(line.split(",")))
            os.remove(data["new.path"] + data["filename"])
        else:
            df.to_csv("hdfs://quickstart.cloudera:8020/user/cloudera/data/" + tech + "_" + old_path_list[0] + ".csv")
            os.remove(data["new.path"] + data["filename"])
And here's my NiFi pipeline picture.
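One thing that stands out: plain open() and os.path.exists() treat "hdfs://..." as a literal local path, so the appends never reach the cluster. Below is a minimal sketch of the append logic using the WebHDFS-based hdfs package (HdfsCLI); the host, port, user and file names are assumptions based on the Cloudera quickstart VM, not values taken from the question:
# Sketch only: requires `pip install hdfs` and WebHDFS enabled on the cluster.
import pandas as pd
from hdfs import InsecureClient

client = InsecureClient('http://quickstart.cloudera:50070', user='cloudera')  # assumed host/port/user

local_csv = '/tmp/KPI_2G_NPO_example.csv'                   # hypothetical incoming file
target = '/user/cloudera/data/KPI_2G_NPO_example.csv'       # hypothetical HDFS target

df = pd.read_csv(local_csv)

if client.status(target, strict=False) is None:
    # Target does not exist yet: create it, header included.
    with client.write(target, encoding='utf-8') as writer:
        df.to_csv(writer, index=False)
else:
    # Target exists: append the rows only, without repeating the header.
    with client.write(target, append=True, encoding='utf-8') as writer:
        df.to_csv(writer, index=False, header=False)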
I'm translating some Linux log data to CSV for data analytics. Some of the steps take a while, so I thought I would put in a progress bar for each file that is being translated. However, when I add a progress bar with either progressbar2 or tqdm, my pandas dataframes are null. There's no data at all. When I remove the progress bar, everything works as it should.
Here is my CSV translating function:
import pandas as pd
from dateutil import parser
from tqdm import trange
import os
import glob
import csv
import socket
def logsToCSV():
print("[+] Translating log to CSV")
log_file = open(CSV_FILE_PATH, "w", newline='')
csv_w = csv.writer(log_file)
for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')): # Find all files in path with .txt
data_file = open(filename, "r")
file_length = len(data_file.readlines())
for i in trange(file_length, desc='loop', leave=False): # Progress Bar Via TQDM
for new_line in data_file:
new_line = line.strip().split(" ")
date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
date = parser.parse(date)
ip =str(new_line[5]).partition("/")
ip = str(ip[0]).strip()
try:
url = str(new_line[7]).strip()
except:
url = None
csv_w.writerow([date,ip,url])
TQDM is breaking something or I am implementing it incorrectly.
EDIT 1:
I figured it out. I was exhausting the file by calling readlines() to get the length. This works:
def logsToCSV():
print("[+] Translating log to CSV")
log_file = open(CSV_FILE_PATH, "w", newline='')
csv_w = csv.writer(log_file)
path, dirs, files = next(os.walk(LOGS_FILE_PATH))
log_num = len(files)
print(log_num)
for filename in glob.glob(os.path.join(LOGS_FILE_PATH, '*.txt')): # Find all files in path with .txt
data_file = open(filename, "r")
with open(filename, "r") as f:
file_length = len(f.readlines())
f.close()
pbar = tqdm(total=file_length)
for line in data_file:
new_line = line.strip().split(" ")
date = str("%s %s %s" % (new_line[0], new_line[1], new_line[2])).strip()
date = parser.parse(date)
ip =str(new_line[5]).partition("/")
ip = str(ip[0]).strip()
try:
url = str(new_line[7]).strip()
except:
url = None
csv_w.writerow([date,ip,url])
pbar.update(1)
pbar.close()
You can apply tqdm directly to your main loop:
from tqdm import tqdm

for line in tqdm(data_file):
    ...  # process the line as before
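For example, in the function from the question you could read the lines once and let tqdm drive the loop, which also avoids the readlines() exhaustion problem (a sketch reusing the names from the question):
from tqdm import tqdm

with open(filename, "r") as data_file:
    lines = data_file.readlines()            # read the file once

for line in tqdm(lines, desc=filename, leave=False):
    new_line = line.strip().split(" ")
    # ... same parsing and csv_w.writerow([...]) as above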
I have a csv file with columns: image_id, image_url
I need to download all the images from the URLs and save each one using the corresponding image_id as the filename. Is there a way to do so?
I'm aware you can do this with Python from code that I've seen online, such as:
import cStringIO # *much* faster than StringIO
import urllib
import Image
try:
    file = urllib.urlopen('http://freegee.sourceforge.net/FG_EN/src/teasers_en/t_gee-power_en.gif')
    im = cStringIO.StringIO(file.read())  # constructs a StringIO holding the image
    img = Image.open(im)
    img.save('/home/wenbert/uploaderx_files/test.gif')
except IOError, e:
    raise e
But how do you reference the URL and the filename from the CSV?
Even better if I can automate the process of uploading to a GCP bucket.
Appreciate any help I can get.
Cheers!
This should help. Use the csv module to parse through your CSV file.
Ex:
# -*- coding: utf-8 -*-
import csv
import cStringIO # *much* faster than StringIO
import urllib
import Image
def downloadFile(imageID, url):
    try:
        file = urllib.urlopen(url)
        im = cStringIO.StringIO(file.read())  # constructs a StringIO holding the image
        img = Image.open(im)
        img.save('/home/wenbert/uploaderx_files/{0}.gif'.format(imageID))
    except IOError, e:
        raise e

with open('PATH_TO_.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # skip the headers
    for row in reader:
        print row
        downloadFile(row[0], row[1])
I have produced a Python script below. I have only tested this in Python 3.4.3, but it should do the trick.
Hope this helps.
import csv, os, requests
import urllib.request
from pathlib import Path

spreadsheetAddress = 'C:\\SOURCE\\CSV\\FILE.csv'
targetDirectory = 'C:\\TARGET\\IMAGE\\SAVE\\LOCATION\\'

def getSpreadsheetContents(spreadsheetAddress):
    with open(spreadsheetAddress) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        imageSet = {}
        for row in readCSV:
            if 'image_id' not in row:
                imageSet[row[0]] = row[1]
        return imageSet

if __name__ == "__main__":
    if os.path.exists(spreadsheetAddress) and os.path.exists(targetDirectory):
        imageDict = getSpreadsheetContents(spreadsheetAddress)
        for key, value in imageDict.items():
            if requests.get(value).status_code == 200:
                filename, file_extension = os.path.splitext(value)
                address = str(targetDirectory + "\\" + key + file_extension)
                urllib.request.urlretrieve(value, address)
    else:
        raise Exception("File not found")
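If you also want to automate the upload to a GCP bucket (the last part of the question), a minimal sketch with the official google-cloud-storage client could look like the following; the bucket name and object prefix are placeholders, and authentication is assumed to come from application default credentials:
from google.cloud import storage

def uploadToBucket(local_path, image_id, file_extension):
    client = storage.Client()                          # uses application default credentials
    bucket = client.bucket('your-bucket-name')         # placeholder bucket name
    blob = bucket.blob('images/' + image_id + file_extension)
    blob.upload_from_filename(local_path)

# e.g. after urllib.request.urlretrieve(value, address):
# uploadToBucket(address, key, file_extension)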
I have a directory of zip files (approximately 10,000 small files), and within each is a CSV file I am trying to read and split into a number of different CSV files.
I managed to write the code to split CSV files from a directory of CSVs, shown below. It reads the first attribute of each row and, depending on what it is, writes the row to the relevant CSV.
import csv
import os
import sys
import re
import glob
reader = csv.reader(open("C:/Projects/test.csv", "rb"), delimiter=',', quotechar='"')
write10 = csv.writer(open('ouput10.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
write15 = csv.writer(open('ouput15.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
headings10=["RECORD_IDENTIFIER","CUSTODIAN_NAME","LOCAL_CUSTODIAN_NAME","PROCESS_DATE","VOLUME_NUMBER","ENTRY_DATE","TIME_STAMP","VERSION","FILE_TYPE"]
write10.writerow(headings10)
headings15=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","USRN","STREET_DESCRIPTION","LOCALITY_NAME","TOWN_NAME","ADMINSTRATIVE_AREA","LANGUAGE"]
write15.writerow(headings15)
for row in reader:
    type = row[0]
    if "10" in type:
        write10.writerow(row)
    elif "15" in type:
        write15.writerow(row)
So I am now trying to read the Zip files rather than wasting time extracting them first.
This is what I have so far, after following as many tutorials as I could find:
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv = '.'.join([dataFile, 'csv'])
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv))
    reader = csv.reader(data)
    for row in reader:
        print row
However, an error gets thrown:
AttributeError: 'str' object has no attribute 'reader'
Hopefully someone can show me how to adapt my working CSV-reading code to read from the zip files.
Much appreciated
Tim
Simple fix. You're shadowing the csv module with your local csv variable. Just change the name of that variable:
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Projects/abase/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv_file = '.'.join([dataFile, 'csv'])  # all fixed
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv_file))  # don't forget this line!
    reader = csv.reader(data)
    for row in reader:
        print row
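As a side note, the code above is Python 2 (StringIO, print row). If you ever move this to Python 3, a roughly equivalent sketch reads each CSV member straight out of the archive with io.TextIOWrapper:
import csv
import glob
import io
import zipfile

for name in glob.glob('C:/Projects/abase/*.zip'):
    with zipfile.ZipFile(name) as zfile:
        for member in zfile.namelist():
            if member.endswith('.csv'):
                # zfile.open() yields a binary file object; wrap it for the text-mode csv reader
                with io.TextIOWrapper(zfile.open(member), encoding='utf-8') as data:
                    reader = csv.reader(data)
                    for row in reader:
                        print(row)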