IP URL Mapping in JSON log file - python

I have a JSON log file and want to print and count the number of times each URL (requestUrl) has been hit by each IP (remoteIp) in the same log file. The output should look like the below:
IP(remoteIp): URL1-(Count), URL2-(Count), URL3...
127.0.0.1: http://www.google.com - 12, www.bing.com/servlet-server.jsp - 2, etc..
A sample entry from the log file is below:
"insertId": "kdkddkdmdkd",
"jsonPayload": {
"#type": "type.googleapis.com/google.cloud.loadbalancing.type.LoadBalancerLogEntry",
"enforcedSecurityPolicy": {
"configuredAction": "DENY",
"outcome": "DENY",
"preconfiguredExprIds": [
"owasp-crs-v030001-id942220-sqli"
],
"name": "shbdbbddjdjdjd",
"priority": 2000
},
"statusDetails": "body_denied_by_security_policy"
},
"httpRequest": {
"requestMethod": "POST",
"requestUrl": "https://dknnkkdkddkd/token",
"requestSize": "3004",
"status": 403,
"responseSize": "274",
"userAgent": "okhttp/3.12.2",
"remoteIp": "127.0.0.1",
"serverIp": "123.123.33.31",
"latency": "0.018728s"
}
The solution that I am using is below. I am able to get the total hits per IP or how many total times a URL has been hit etc.
import json
from collections import Counter

unique_ip = {}
request_url = {}

def getAndSaveValueSafely(freqTable, searchDict, key):
    try:
        tmp = searchDict['httpRequest'][key]
        if tmp in freqTable:
            freqTable[tmp] += 1
        else:
            freqTable[tmp] = 1
    except KeyError:
        if 'not_present' in freqTable:
            freqTable['not_present'] += 1
        else:
            freqTable['not_present'] = 1

with open("threat_intel_1.json") as file:
    data = json.load(file)
    for d2 in data:
        getAndSaveValueSafely(unique_ip, d2, 'remoteIp')
        getAndSaveValueSafely(request_url, d2, 'requestUrl')

mc_unique_ip = dict(Counter(unique_ip).most_common())
mc_request_url = dict(Counter(request_url).most_common())

def printing():
    a = str(len(unique_ip))
    b = str(len(request_url))
    # minTs and maxTs are not defined in the snippet as posted
    with open("output.txt", "w") as f1:
        print(
            f' Start Time of log = {minTs}'
            f' \n\n End Time of log = {maxTs} \n\n\n {a} Unique IP List = {mc_unique_ip} \n\n\n {b} Unique URL = {mc_request_url}', file=f1)

I don't think you need to use Counter, and you are unlikely to see any benefit from it:
import json
from collections import defaultdict

result = {}  # start empty

with open("threat_intel_1.json") as file:
    data = json.load(file)
    for d2 in data:
        req = d2.get('httpRequest', None)
        if not req:
            continue
        url = req['requestUrl']
        ip = req['remoteIp']
        result.setdefault(url, defaultdict(int))[ip] += 1

print(result)
# {"/endpoint.html": {"127.2.3.4": 15, "222.11.31.22": 2}}
If instead you want it the other way, that's easy too:
for d2 in data:
    req = d2.get('httpRequest', None)
    if not req:
        continue
    url = req['requestUrl']
    ip = req['remoteIp']
    result.setdefault(ip, defaultdict(int))[url] += 1

# {"127.1.2.3": {"/endpoint1.html": 15, "/endpoint2.php": 1}, "33.44.55.66": {"/endpoint1.html": 5}, ...}
Instead of using defaultdict you could swap that line for:
# result.setdefault(ip, defaultdict(int))[url] += 1
result.setdefault(ip, {})
result[ip][url] = result[ip].get(url, 0) + 1
which arguably is more readable anyway...
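To get the output format asked for in the question (IP: URL - count, URL - count, ...), a small formatting pass over the ip-keyed result from the second loop would do; a sketch (the printed line is illustrative, not taken from the real log):
# A sketch, assuming `result` maps each remoteIp to a dict of
# {requestUrl: hit_count} as built in the second loop above.
for ip, urls in result.items():
    counts = ", ".join(f"{u} - {c}" for u, c in urls.items())
    print(f"{ip}: {counts}")
# e.g. 127.0.0.1: https://dknnkkdkddkd/token - 12, https://dknnkkdkddkd/login - 2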

Related

Creating multiple dataframe using loop or function

I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached my code below. I want to pass three URLs and get back three different dictionaries holding the values. I'm stuck and don't understand how to go about it. I have tried using loops, but it is not working out for me.
url = {'Bitcoin': 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
       'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
       'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}

for ele in url:
    #### requesting the page and extracting the script which has date and values
    session = requests.Session()
    page = session.get(ele[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    values = str(soup.find_all('script')[4])
    values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
    # create an empty dict to append date and hashrates
    dict([("crypto_1 %s" % i, []) for i in range(len(url))])
    # run a loop over all the dates and adding to dictionary
    for i in range(values.count('new Date')):
        date = values.split('new Date("')[i+1].split('"')[0]
        value = values.split('"),')[i+1].split(']')[0]
        dict([("crypto_1 %s" % i)[date] = value
You can use the next example to get data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd

url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

data = []
for name, u in url.items():
    html_doc = requests.get(u).text
    for date, hash_rate in re.findall(
        r'\[new Date\("(.*?)"\),(.*?)\]', html_doc
    ):
        data.append(
            {
                "Name": name,
                "Date": date,
                "Hash Rate": float("nan")
                if hash_rate == "null"
                else float(hash_rate),
            }
        )

df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/HashRate:
out = {}
for name, g in df.groupby("Name"):
    out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")

print(out)
Prints:
{
    "Bitcoin": {
        "Date": [
            Timestamp("2009-01-03 00:00:00"),
            Timestamp("2009-01-04 00:00:00"),
            Timestamp("2009-01-05 00:00:00"),
...
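If you want one object per coin (as the question asks) rather than a single dict, the same groupby can hand back separate DataFrames; a short sketch building on the df from the answer above:
# A sketch building on the df defined in the answer above:
# one DataFrame per cryptocurrency, keyed by its name.
dfs = {name: g.reset_index(drop=True) for name, g in df.groupby("Name")}
print(dfs["Bitcoin"].head())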

urls.E004 path not working in Python Django

So, every time I try to run the server it sends this error:
?: (urls.E004) Your URL pattern b'["Aquaman", "Glass", "Bumblebee", "Polar", "Bohemian Rhapsody", "Widows", "Creed II", "Escape Room", "Dragon Ball Super: Broly", "Mortal Engines", "6.9", "6.9", "6.5", "6.4", "8.2", "6.7", "6.6", "6.6", "7.5", "5.9", "297802", "450465", "424783", "483906", "424694", "401469", "480530", "522681", "503314", "428078", "143", "129", "114", "119", "135", "130", "130", "100", "101", "128", "Action", "Thriller", "Action", "Action", "Drama", "Crime", "Drama", "Horror", "Action", "Science Fiction"]' is invalid. Ensure that urlpatterns is a list of path() and/or re_path() instances.
This is the code
import requests
import json
from django.conf.urls import url, include, re_path
from django.contrib import admin
from django.http import JsonResponse, HttpResponse
from django.urls import path

def lookForValue(value, list, moves, number_of_values):
    i = 0
    valuesFound = []
    for string in list:
        if string == value and i < number_of_values:
            pointer = list.index(string)
            valuesFound.append(list[pointer + moves])
            i += 1
            list[pointer] = None
    return valuesFound

def getData(apiurl):
    payload = {}
    data = requests.request("GET", apiurl, data=payload)
    data = data.text
    data = data.split('\"')
    return data

url = "https://api.themoviedb.org/3/discover/movie?api_key=31563efa3563f61e00e5f0e2ed88c4bb&language=en-US&sort_by=popularity.desc&include_adult=false&include_video=false&page=1"
list = getData(url)
titles = []
release_dates = []
ratings = []
uids = []
runtimes = []
genres = []
total = []
i = 0
j = 0

titles = lookForValue('title', list, 2, 10)
release_dates = lookForValue('release_date', list, 2, 10)
ratings = lookForValue('vote_average', list, 1, 10)
uids = lookForValue('id', list, 1, 10)

for string in ratings:
    ratings[ratings.index(string)] = string[1:4]
for string in uids:
    uids[uids.index(string)] = string[1:7]
for string in uids:
    response = getData('https://api.themoviedb.org/3/movie/' + string + '?api_key=31563efa3563f61e00e5f0e2ed88c4bb&language=en-US')
    genres.append(lookForValue('genres', response, 6, 1)[0])
    runtimes.append(lookForValue('runtime', response, 1, 1)[0])

for i in range(0, 10):
    runtimes[i] = runtimes[i][1:len(runtimes[i])]
    runtimes[i] = runtimes[i][0:len(runtimes[i]) - 1]

total += titles
total += ratings
total += uids
total += runtimes
total += genres

def createJsonResponse(object):
    return JsonResponse(object, safe=False)

urlpatterns = [
    path('popularmovies', include(createJsonResponse(total)))
]
Thanks for the help!
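The error arises because urlpatterns must contain path() / re_path() entries whose second argument is a view callable, while include(createJsonResponse(total)) passes in an already-built JsonResponse at import time. A minimal sketch of the expected shape (popular_movies is a hypothetical view name; total and JsonResponse come from the module above):
# Hypothetical rewrite of the end of the module: path() takes a view
# callable, which Django invokes per request and which returns the response.
def popular_movies(request):
    return JsonResponse(total, safe=False)

urlpatterns = [
    path('popularmovies', popular_movies),
]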

CSV to elasticsearch with python SerializationError

When I try to send the bulk_data to the local Elasticsearch, my data isn't loaded because of a SerializationError.
I already tried to fill the empty cells in the csv file, but that wasn't the solution.
from elasticsearch import Elasticsearch

bulk_data = []
header = []
count = 0
for row in csv_file_object:
    if count > 0:
        data_dict = {}
        for i in range(len(row)):
            row = row.rstrip()
            data_dict[header[i]] = row[i]
        op_dict = {
            "index": {
                "_index": INDEX_NAME,
                "_type": TYPE_NAME,
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)
    else:
        header = row
    count = count + 1

# create ES client, create index
es = Elasticsearch(hosts=[ES_HOST])
if es.indices.exists(INDEX_NAME):
    print("deleting '%s' index..." % (INDEX_NAME))
    res = es.indices.delete(index=INDEX_NAME)
res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
See image for the SerializationError and bulk_data values:
Please note: the \n is added by the serialization process itself.
I'd like to respond, but there is one thing I can't understand: how do you retrieve your field names from the data? In your code you take them from a list called header, which starts out empty. Check my answer; I'm not sure I've understood correctly.
from elasticsearch import Elasticsearch
from elasticsearch import helpers

index_name = "your_index_name"
doc_type = "your_doc_type"
esConnector = Elasticsearch(["http://192.168.1.1:9200/"])
# change your ip here

def generate_data(csv_file_object):
    count = 0  # running id, kept local to the generator
    with open(csv_file_object, "r") as f:
        for line in f:
            line = line.rstrip().split(",")
            data_dict = {header[count]: line}
            obj = {
                '_op_type': 'index',
                '_index': index_name,
                '_type': doc_type,
                '_id': count + 1,
                '_source': data_dict
            }
            count += 1
            yield obj

for success, info in helpers.parallel_bulk(client=esConnector, actions=generate_data(csv_file_object), thread_count=4):
    if not success:
        print('Doc failed', info)
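If the field names are just the first row of the CSV (which is what the question's header list suggests), the csv module can supply them directly; a minimal sketch, assuming a plain comma-separated file with a header row (the file name, host and index name are placeholders):
# A sketch, assuming the first CSV row holds the column names.
# csv.DictReader builds one dict per data row keyed by those names.
import csv
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["http://192.168.1.1:9200/"])  # placeholder host

def generate_actions(path, index_name):
    with open(path, newline="") as f:
        for i, row in enumerate(csv.DictReader(f), start=1):
            yield {"_op_type": "index", "_index": index_name, "_id": i, "_source": row}

for ok, info in helpers.parallel_bulk(es, generate_actions("data.csv", "your_index_name"), thread_count=4):
    if not ok:
        print("Doc failed", info)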

Add entries into JSON

I am working with an API that doesn't have all the information I need in a single call, and I need to add the project code it came from to the data returned by the call I am making. Right now it appends the project data to the list, but I really need it to be part of the original call. Here is my output now:
[{"committer_email": "justin.m.boucher#example.com", "short_id": "981147b9", "title": "Added .gitignore", "author_email": "justin.m.boucher#example.com", "authored_date": "2017-08-29T08:31:11.000-07:00", "created_at": "2017-08-29T08:31:11.000-07:00", "author_name": "Justin Boucher", "parent_ids": [], "committed_date": "2017-08-29T08:31:11.000-07:00", "message": "Added .gitignore\n", "committer_name": "Justin Boucher", "id": "981147b905913a60796283ce10f915c53679df49"}, {"project_id": "2"}]
Here is the output I want to achieve:
[{"project_id": "2", "committer_email": "justin.m.boucher#example.com", "short_id": "981147b9", "title": "Added .gitignore", "author_email": "justin.m.boucher#example.com", "authored_date": "2017-08-29T08:31:11.000-07:00", "created_at": "2017-08-29T08:31:11.000-07:00", "author_name": "Justin Boucher", "parent_ids": [], "committed_date": "2017-08-29T08:31:11.000-07:00", "message": "Added .gitignore\n", "committer_name": "Justin Boucher", "id": "981147b905913a60796283ce10f915c53679df49"}]
Here is my code so far:
get_commits.py:
import gitlab
import json

gitlab = gitlab.Gitlab()
projects = gitlab.getProjectID()

for i in projects:
    api_str = '/projects/' + str(i) + '/repository/commits'
    connect = gitlab.connectAPI(apiCall=api_str)
    data = json.dumps(connect)
    # Append project id to json, since it isn't created
    # in the commits from Gitlab
    commit = json.loads(data)
    commit.append({'project_id': str(i)})
    # make it pretty again for Splunk to read
    commit = json.dumps(commit)
    print commit
gitlab.py
import os
import ConfigParser
import requests
import json

# Setup Splunk Environment
APPNAME = 'splunk_gitlab'
CONFIG = 'appconfig.conf'
SPLUNK_HOME = os.environ['SPLUNK_HOME']
parser = ConfigParser.SafeConfigParser()

class Gitlab():
    # # Load Settings
    # parser.read(SPLUNK_HOME + '/etc/apps/' + APPNAME + '/local/' + CONFIG)
    # if parser.has_section('Authentication'):
    #     pass
    # else:
    #     parser.read(SPLUNK_HOME + '/etc/apps/' + APPNAME + '/default/' + CONFIG)
    #
    # GITLAB_URL = parser.get('Authentication', 'GITLAB_URL')
    # API_KEY = parser.get('Authentication', 'API_KEY')

    # Used for testing only
    GITLAB_URL = 'http://<my_address>'
    API_KEY = '<my_key>'
    API_SERVER = GITLAB_URL + '/api/v4'

    # Place api call to retrieve data
    def connectAPI(self, apiCall='/projects'):
        headers = {
            'PRIVATE-TOKEN': self.API_KEY
        }
        final_url = self.API_SERVER + apiCall
        resp = requests.get(final_url, headers=headers)
        status_code = resp.status_code
        resp = resp.json()
        if status_code == 200:
            return resp
        else:
            raise Exception("Something went wrong requesting (%s): %s" % (
                resp['errors'][0]['errorType'], resp['errors'][0]['message']))

    def getProjectID(self):
        connect = self.connectAPI(apiCall='/projects')
        data = json.dumps(connect)
        projects = json.loads(data)
        project_list = []
        for i in projects:
            project_list.append(i['id'])
        return project_list
If you want to add a new element to the first dictionary in the list instead of appending a new dictionary to the list, try using assignment instead of append.
commit[0]['project_id'] = str(i)
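If the commits endpoint returns more than one commit per project, the same assignment can be applied to every element before re-serializing; a sketch using the variable names from get_commits.py above:
# A sketch using the names from get_commits.py: tag each commit dict
# in the list with the project it came from, then re-serialize for Splunk.
for c in commit:
    c['project_id'] = str(i)
commit = json.dumps(commit)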

Conditional Statement to re-start Python script based on response from POST request

I have a Python script where I am sending a POST request for data to a server. I am expecting a particular response which indicates there is data in the response. If I do not receive this response, how can I restart my script / go back to the beginning of it? The script is wrapped in a function which allows it to run every minute.
I would like to return to the beginning of my function if my response isn't as expected.
Script:
import sched, time, requests, jsonpickle, arcpy, requests, json, datetime

s = sched.scheduler(time.time, time.sleep)

def do_something(sc):
    data2 = jsonpickle.decode((f2.read()))
    Start = datetime.datetime.now()
    # Start = datetime.datetime.strftime(data2['QueryRequest']['LastUpdatedDate'])
    DD = datetime.timedelta(minutes=5)
    earlier = Start - DD
    earlier_str = earlier.strftime('X%m/%d/%Y %H:%M:%S').replace('X0', 'X').replace('X', '')
    data2["QueryRequest"]['LastUpdatedDate'] = str(earlier_str)
    data2 = jsonpickle.encode(data2)
    BulkyItemInfo = " "
    spatial_ref = arcpy.SpatialReference(4326)
    lastpage = 'false'
    startrow = 0
    newquery = 'new'
    pagesize = 100

    url2 = "URL"
    headers2 = {'Content-type': 'text/plain', 'Accept': '/'}

    while lastpage == 'false':
        r2 = requests.post(url2, data=data2, headers=headers2)
        print r2.text
        decoded2 = json.loads(r2.text)
        f2 = open('C:\Users\GeoffreyWest\Desktop\Request.json')
        data2 = jsonpickle.decode((f2.read()))
        if decoded2['Response']['LastPage'] == 'false':
            data2['QueryRequest']['PageSize'] = pagesize
            startrow = startrow + data2['QueryRequest']['PageSize']
            data2['QueryRequest']['StartRowNum'] = startrow
            data2['QueryRequest']['NewQuery'] = 'false'
            data2 = jsonpickle.encode(data2)
            print startrow
        else:
            lastpage = 'true'
            print json.dumps(decoded2, sort_keys=True, indent=4)

    items = []
    for sr in decoded2['Response']['ListOfServiceRequest']['ServiceRequest']:  # Where response is successful or fails
Output for successful response:
{
    "status": {
        "code": 311,
        "message": "Service Request Successfully Queried.",
        "cause": ""
    },
    "Response": {
        "LastPage": "false",
        "NumOutputObjects": "100",
        "ListOfServiceRequest": {
            "ServiceRequest": [
                {
Output for unsuccessful response:
{"status":{"code":311,"message":"Service Request Successfully Queried.","cause":""},"Response":{"LastPage":"true","NumOutputObjects":"0","ListOfServiceRequest":{}}}
