Error with reading JSON file lines in Django app - python

I have this code but I have some lines in my JSON file with empty lines. And I get this error.
This is a Custom Command and I get this error. I want to create a list of jobs in The database of my Django app, I am using a For loop. Thank a lot for your help
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 2)
from django.core.management.base import BaseCommand
from jobs.models import Job
import json
class Command(BaseCommand):
help = 'Set up the database'
def handle(self, *args: str, **options: str):
with open('static/joblist03112020.json', 'r') as handle:
for line in handle.readlines():
print(line)
line = json.loads(line)
existing_job = Job.objects.filter(
job_title = line['job_title'],
company = line['company'],
company_url = line['company_url'],
description = line['description'],
salary = line['salary'],
city = line['city'],
district = line['district'],
url = line['url'],
job_type = line['job_type'],
)
if existing_job.exists() is False:
Job.objects.create(
job_title = line['job_title'],
company = line['company'],
company_url = line['company_url'],
description = line['description'],
salary = line['salary'],
city = line['city'],
district = line['district'],
url = line['url'],
job_type = line['job_type'],
)
Job.save()
self.stdout.write(self.style.SUCCES('added jobs!add'))

You'll probably want to load the entire JSON file instead of loading line-by-line. Not every line in a JSON file is guaranteed to be parseable in itself as a valid JSON object.
Here is the spirit of what I think you want to do:
with open('static/joblist03112020.json') as f:
job_list = json.load(f)
Note that opening with 'r' for read is the default, so I removed it.
Now you can iterate through each job in the list, and apply the filter. If your JSON object keys match the columns of your model, you may be able to simplify that code as well:
for job in job_list:
existing_job = Job.objects.filter(**job)
if not existing_job.exists():
Job.objects.create(**job)
Note that you will not need to call .save() since create includes saving. Also, the .save() method is only defined on an instance of the class anyway.

someone out of this platform helps me with my problem. this is what I did. and solve the problem. I needed to work with big JSON
def handle(self, *args: str, **options: str):
with open('static/data.json', 'r') as handle:
big_json = json.loads(handle.read())
for item in big_json:
existing_job = Job.objects.filter(

Related

Python wildcard search

I have a Lambda python function that I inherited which searches and reports on installed packages on EC2 instances. It pulls this information from SSM Inventory where the results are output to an S3 bucket. All of the installed packages have specific names until now. Now we need to report on Palo Alto Cortex XDR. The issue I'm facing is that this product includes the version number in the name and we have different versions installed. If I use the exact name (i.e. Cortex XDR 7.8.1.11343) I get reporting on that particular version but not others. I want to use a wild card to do this. I import regex (import re) on line 7 and then I change line 71 to xdr=line['Cortex*']) but it gives me the following error. I'm a bit new to Python and coding so any explanation as to what I'm doing wrong would be helpful.
File "/var/task/SoeSoftwareCompliance/RequiredSoftwareEmail.py", line 72, in build_html
xdr=line['Cortex*'])
import configparser
import logging
import csv
import json
from jinja2 import Template
import boto3
import re
# config
config = configparser.ConfigParser()
config.read("config.ini")
# logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# #TODO
# refactor common_csv_header so that we use one with variable
# so that we write all content to one template file.
def build_html(account=None,
ses_email_address=None,
recipient_email=None):
"""
:param recipient_email:
:param ses_email_address:
:param account:
"""
account_id = account["id"]
account_alias = account["alias"]
linux_ec2s = []
windows_ec2s = []
ec2s_not_in_ssm = []
excluded_ec2s = []
# linux ec2s html
with open(f"/tmp/{account_id}_linux_ec2s_required_software_report.csv", "r") as fp:
lines = csv.DictReader(fp)
for line in lines:
if line["platform-type"] == "Linux":
item = dict(id=line['instance-id'],
name=line['instance-name'],
ip=line['ip-address'],
ssm=line['amazon-ssm-agent'],
cw=line['amazon-cloudwatch-agent'],
ch=line['cloudhealth-agent'])
# skip compliant linux ec2s where are values are found
compliance_status = not all(item.values())
if compliance_status:
linux_ec2s.append(item)
# windows ec2s html
with open(f"/tmp/{account_id}_windows_ec2s_required_software_report.csv", "r") as fp:
lines = csv.DictReader(fp)
for line in lines:
if line["platform-type"] == "Windows":
item = dict(id=line['instance-id'],
name=line['instance-name'],
ip=line['ip-address'],
ssm=line['Amazon SSM Agent'],
cw=line['Amazon CloudWatch Agent'],
ch=line['CloudHealth Agent'],
mav=line['McAfee VirusScan Enterprise'],
trx=line['Trellix Agent'],
xdr=line['Cortex*'])
# skip compliant windows ec2s where are values are found
compliance_status = not all(item.values())
if compliance_status:
windows_ec2s.append(item)
# ec2s not found in ssm
with open(f"/tmp/{account_id}_ec2s_not_in_ssm.csv", "r") as fp:
lines = csv.DictReader(fp)
for line in lines:
item = dict(name=line['instance-name'],
id=line['instance-id'],
ip=line['ip-address'],
pg=line['patch-group'])
ec2s_not_in_ssm.append(item)
# display or hide excluded ec2s from report
display_excluded_ec2s_in_report = json.loads(config.get("settings", "display_excluded_ec2s_in_report"))
if display_excluded_ec2s_in_report == "true":
with open(f"/tmp/{account_id}_excluded_from_compliance.csv", "r") as fp:
lines = csv.DictReader(fp)
for line in lines:
item = dict(id=line['instance-id'],
name=line['instance-name'],
pg=line['patch-group'])
excluded_ec2s.append(item)
# pass data to html template
with open('templates/email.html') as file:
template = Template(file.read())
# pass parameters to template renderer
html = template.render(
linux_ec2s=linux_ec2s,
windows_ec2s=windows_ec2s,
ec2s_not_in_ssm=ec2s_not_in_ssm,
excluded_ec2s=excluded_ec2s,
account_id=account_id,
account_alias=account_alias)
# consolidated html with multiple tables
tables_html_code = html
client = boto3.client('ses')
client.send_email(
Destination={
'ToAddresses': [recipient_email],
},
Message={
'Body': {
'Html':
{'Data': tables_html_code}
},
'Subject': {
'Charset': 'UTF-8',
'Data': f'SOE | Software Compliance | {account_alias}',
},
},
Source=ses_email_address,
)
print(tables_html_code)
If I understand your problem correctly, you are getting a KeyError exception because Python does not support wildcards out of the box. A csv.DictReader creates a standard Python dictionary for each row in csv. Python's dictionary is just an associative array without pattern matching.
You can implement this by regex, though. If you have a dictionary line and you don't know the full name of a key you are looking for, you can solve it by re.search function.
line = {'Cortex XDR 7.8.1.11343': 'Some value you are looking for'}
val = next(v for k, v in line.items() if re.search('Cortex.+', k))
print(val) # 'Some value you are looking for'
Be aware that this assumes that a line dictionary contains at least one item that matches the 'Cortex.+' pattern and returns the first match. You would have to refactor this a bit to change this.
1. import os - missing in the code
2. def build_html(account=None -> When the account is pass with Nonetype and below error will thrown in account["id"] and account["alias"].
Ex:
Traceback (most recent call last):
File "C:\Users\pro\Documents\project\pywilds.py", line 134, in <module>
build_html(account=None)
File "C:\Users\pro\Documents\project\pywilds.py", line 33, in build_html
account_id = account["id"]
TypeError: 'NoneType' object is not subscriptable
I hope it helps..

Flask loop takes long time to complete

I have this loop in my app.py. For some reason it extends the load time by over 3 seconds. Are there any solutions?
import dateutil.parser as dp
# Converts date from ISO-8601 string to formatted string and returns it
def dateConvert(date):
return dp.parse(date).strftime("%H:%M # %e/%b/%y")
def nameFromID(userID):
if userID is None:
return 'Unknown'
else:
response = requests.get("https://example2.org/" + str(userID), headers=headers)
return response.json()['firstName'] + ' ' + response.json()['lastName']
logs = []
response = requests.get("https://example.org", headers=headers)
for response in response.json():
logs.append([nameFromID(response['member']), dateConvert(response['createdAt'])])
It extends the load time by over 3 seconds because it does a lot of unnecessary work, that's why.
You're not using requests Sessions. Each request will require creating and tearing down an HTTPS connection. That's slow.
You're doing another HTTPS request for each name conversion. (See above.)
You're parsing the JSON you get in that function twice.
Whatever dp.parse() is (dateutil?), it's probably doing a lot of extra work parsing from a free-form string. If you know the input format, use strptime.
Here's a rework that should be significantly faster. Please see the TODO points first, of course.
Also, if you are at liberty to knowing the member id -> name mapping doesn't change, you can make name_cache a suitably named global variable too (but remember it may be persisted between requests).
import datetime
import requests
INPUT_DATE_FORMAT = "TODO_FILL_ME_IN" # TODO: FILL ME IN.
def dateConvert(date: str):
return datetime.datetime.strptime(date, INPUT_DATE_FORMAT).strftime(
"%H:%M # %e/%b/%y"
)
def nameFromID(sess: requests.Session, userID):
if userID is None:
return "Unknown"
response = sess.get(f"https://example2.org/{userID}")
response.raise_for_status()
data = response.json()
return "{firstName} {lastName}".format_map(data)
def do_thing():
headers = {} # TODO: fill me in
name_cache = {}
with requests.Session() as sess:
sess.headers.update(headers)
logs = []
response = sess.get("https://example.org")
for response in response.json():
member_id = response["member"]
name = name_cache.get(member_id)
if not name:
name = name_cache[member_id] = nameFromID(sess, member_id)
logs.append([name, dateConvert(response["createdAt"])])

Adding simple menu in python

All, I wrote a small python script to parse out data from a log file. I was able to parse out what I need. Now I am trying to create a menu so that user can choose which data they want to parse out rather than all of the log content. I am having a little struggle trying to figure out how to do it, could someone please help me start on making a menu. I am a newbie to Python.
This is what I have so far:
import re
with open('temp.log') as f:
lines = f.readlines()
data = []
for line in lines:
date = re.match(r'\d{2} \w+ \d{2}', line).group()
time = line.split()[3]
ids = line.split()[4]
try:
agent = re.search(r'agent:\s(.*?),', line).group()
except:
agent = 'agent:'
try:
errID = re.search(r'ErrIdText:\s(.*?),', line).group()
except:
errID = 'ErrIdText:'
try:
clear = re.search(r'clearedID:\s(.*?)\)', line).group()
except:
clear = 'clearedID:'
row = [date, time, ids, agent, errID, clear]
data.append(row)
for row in data:
print(row)
So I want to make a menu so user can choose if they only want to parse out the date and the agent name for example.
You can use click to implement your menu through the command line. It will parse the arguments and you will be able to filter out the operations. It is also easy to understand and implement for simple stuff. For example:
import re
import click
date_pattern = re.compile(r'\d{2} \w+ \d{2}')
agent_pattern = re.compile(r'agent:\s(.*?),')
err_pattern = re.compile(r'ErrIdText:\s(.*?),')
clear_pattern = re.compile(r'clearedID:\s(.*?)\)')
#click.command()
#click.option('--filter-agent', is_flag=True, default=False, help='Filter agent')
#click.option('--filter-err-id', is_flag=True, default=False, help='Filter Error ID')
#click.option('--filter-cleared-id', is_flag=True, default=False, help='Filter Cleared ID')
#click.argument('filename')
def get_valid_rows(filter_agent, filter_err_id, filter_cleared_id, filename):
with open(filename) as f:
lines = f.readlines()
data = []
for line in lines:
date = date_pattern.match(line).group()
time = line.split()[3]
ids = line.split()[4]
row = [date, time, ids]
if filter_agent:
try:
agent = agent_pattern.search(line).group()
except:
agent = 'agent:'
row.append(agent)
if filter_err_id:
try:
errID = err_pattern.search(line).group()
except:
errID = 'ErrIdText:'
row.append(errID)
if filter_cleared_id:
try:
clear = clear_pattern.search(line).group()
except:
clear = 'clearedID:'
row.append(clear)
data.append(row)
# Do everything else
if __name__ == "__main__":
get_valid_rows()
It'll even generate a well-formatted help message for you
Usage: parselog.py [OPTIONS] FILENAME
Options:
--filter-agent Filter agent
--filter-err-id Filter Error ID
--filter-cleared-id Filter Cleared ID
--help Show this message and exit.
You could edit it to your liking to achieve exactly what you want.
That's a very large question, but what you need is either a UI (like Tkinter or Pyqt) or a command line interface (which you could implement yourself, or build using a library like docopt).
However, the command-line option will be a lot simpler to implement.

Parsing Json with multiple "levels" with Python

I'm trying to parse a json file from an api call.
I have found this code that fits my need and trying to adapt it to what I want:
import math, urllib2, json, re
def download():
graph = {}
page = urllib2.urlopen("http://fx.priceonomics.com/v1/rates/?q=1")
jsrates = json.loads(page.read())
pattern = re.compile("([A-Z]{3})_([A-Z]{3})")
for key in jsrates:
matches = pattern.match(key)
conversion_rate = -math.log(float(jsrates[key]))
from_rate = matches.group(1).encode('ascii','ignore')
to_rate = matches.group(2).encode('ascii','ignore')
if from_rate != to_rate:
if from_rate not in graph:
graph[from_rate] = {}
graph[from_rate][to_rate] = float(conversion_rate)
return graph
And I've turned it into:
import math, urllib2, json, re
def download():
graph = {}
page = urllib2.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries")
jsrates = json.loads(page.read())
for pattern in jsrates['result'][0]['MarketName']:
for key in jsrates['result'][0]['Ask']:
matches = pattern.match(key)
conversion_rate = -math.log(float(jsrates[key]))
from_rate = matches.group(1).encode('ascii','ignore')
to_rate = matches.group(2).encode('ascii','ignore')
if from_rate != to_rate:
if from_rate not in graph:
graph[from_rate] = {}
graph[from_rate][to_rate] = float(conversion_rate)
return graph
Now the problem is that there is multiple level in the json "Result > 0, 1,2 etc"
json screenshot
for key in jsrates['result'][0]['Ask']:
I want the zero to be able to be any number, I don't know if thats clear.
So I could get all the ask price to match their marketname.
I have shortened the code so it doesnt make too long of a post.
Thanks
PS: sorry for the english, its not my native language.
You could loop through all of the result values that are returned, ignoring the meaningless numeric index:
for result in jsrates['result'].values():
ask = result.get('Ask')
if ask is not None:
# Do things with your ask...

Fetching language detection from Google api

I have a CSV with keywords in one column and the number of impressions in a second column.
I'd like to provide the keywords in a url (while looping) and for the Google language api to return what type of language was the keyword in.
I have it working manually. If I enter (with the correct api key):
http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=merde
I get:
{"responseData": {"language":"fr","isReliable":false,"confidence":6.213709E-4}, "responseDetails": null, "responseStatus": 200}
which is correct, 'merde' is French.
so far I have this code but I keep getting server unreachable errors:
import time
import csv
from operator import itemgetter
import sys
import fileinput
import urllib2
import json
E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2
#not working
def parse_result(result):
"""Parse a JSONP result string and return a list of terms"""
# Deserialize JSON to Python objects
result_object = json.loads(result)
#Get the rows in the table, then get the second column's value
# for each row
return row in result_object
#not working
def retrieve_terms(seedterm):
print(seedterm)
"""Retrieves and parses data and returns a list of terms"""
url_template = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=%(seed)s'
url = url_template % {"seed": seedterm}
try:
with urllib2.urlopen(url) as data:
data = perform_request(seedterm)
result = data.read()
except:
sys.stderr.write('%s\n' % 'Could not request data from server')
exit(E_OPERATION_ERROR)
#terms = parse_result(result)
#print terms
print result
def main(argv):
filename = argv[1]
csvfile = open(filename, 'r')
csvreader = csv.DictReader(csvfile)
rows = []
for row in csvreader:
rows.append(row)
sortedrows = sorted(rows, key=itemgetter('impressions'), reverse = True)
keys = sortedrows[0].keys()
for item in sortedrows:
retrieve_terms(item['keywords'])
try:
outputfile = open('Output_%s.csv' % (filename),'w')
except IOError:
print("The file is active in another program - close it first!")
sys.exit()
dict_writer = csv.DictWriter(outputfile, keys, lineterminator='\n')
dict_writer.writer.writerow(keys)
dict_writer.writerows(sortedrows)
outputfile.close()
print("File is Done!! Check your folder")
if __name__ == '__main__':
start_time = time.clock()
main(sys.argv)
print("\n")
print time.clock() - start_time, "seconds for script time"
Any idea how to finish the code so that it will work? Thank you!
Try to add referrer, userip as described in the docs:
An area to pay special attention to
relates to correctly identifying
yourself in your requests.
Applications MUST always include a
valid and accurate http referer header
in their requests. In addition, we
ask, but do not require, that each
request contains a valid API Key. By
providing a key, your application
provides us with a secondary
identification mechanism that is
useful should we need to contact you
in order to correct any problems. Read
more about the usefulness of having an
API key
Developers are also encouraged to make
use of the userip parameter (see
below) to supply the IP address of the
end-user on whose behalf you are
making the API request. Doing so will
help distinguish this legitimate
server-side traffic from traffic which
doesn't come from an end-user.
Here's an example based on the answer to the question "access to google with python":
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib, urllib2
from pprint import pprint
api_key, userip = None, None
query = {'q' : 'матрёшка'}
referrer = "https://stackoverflow.com/q/4309599/4279"
if userip:
query.update(userip=userip)
if api_key:
query.update(key=api_key)
url = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&%s' %(
urllib.urlencode(query))
request = urllib2.Request(url, headers=dict(Referer=referrer))
json_data = json.load(urllib2.urlopen(request))
pprint(json_data['responseData'])
Output
{u'confidence': 0.070496580000000003, u'isReliable': False, u'language': u'ru'}
Another issue might be that seedterm is not properly quoted:
if isinstance(seedterm, unicode):
value = seedterm
else: # bytes
value = seedterm.decode(put_encoding_here)
url = 'http://...q=%s' % urllib.quote_plus(value.encode('utf-8'))

Categories

Resources