So, every time I try to run the server, it raises this error:
?: (urls.E004) Your URL pattern b'["Aquaman", "Glass", "Bumblebee", "Polar", "Bohemian Rhapsody", "Widows", "Creed II", "Escape Room", "Dragon Ball Super: Broly", "Mortal Engines", "6.9", "6.9", "6.5", "6.4", "8.2", "6.7", "6.6", "6.6", "7.5", "5.9", "297802", "450465", "424783", "483906", "424694", "401469", "480530", "522681", "503314", "428078", "143", "129", "114", "119", "135", "130", "130", "100", "101", "128", "Action", "Thriller", "Action", "Action", "Drama", "Crime", "Drama", "Horror", "Action", "Science Fiction"]' is invalid. Ensure that urlpatterns is a list of path() and/or re_path() instances.
This is the code:
import requests
import json
from django.conf.urls import url, include, re_path
from django.contrib import admin
from django.http import JsonResponse, HttpResponse
from django.urls import path


def lookForValue(value, list, moves, number_of_values):
    i = 0
    valuesFound = []
    for string in list:
        if string == value and i < number_of_values:
            pointer = list.index(string)
            valuesFound.append(list[pointer + moves])
            i += 1
            list[pointer] = None
    return valuesFound


def getData(apiurl):
    payload = {}
    data = requests.request("GET", apiurl, data=payload)
    data = data.text
    data = data.split('\"')
    return data


url = "https://api.themoviedb.org/3/discover/movie?api_key=31563efa3563f61e00e5f0e2ed88c4bb&language=en-US&sort_by=popularity.desc&include_adult=false&include_video=false&page=1"
list = getData(url)
titles = []
release_dates = []
ratings = []
uids = []
runtimes = []
genres = []
total = []
i = 0
j = 0
titles = lookForValue('title', list, 2, 10)
release_dates = lookForValue('release_date', list, 2, 10)
ratings = lookForValue('vote_average', list, 1, 10)
uids = lookForValue('id', list, 1, 10)
for string in ratings:
    ratings[ratings.index(string)] = string[1:4]
for string in uids:
    uids[uids.index(string)] = string[1:7]
for string in uids:
    response = getData('https://api.themoviedb.org/3/movie/' + string + '?api_key=31563efa3563f61e00e5f0e2ed88c4bb&language=en-US')
    genres.append(lookForValue('genres', response, 6, 1)[0])
    runtimes.append(lookForValue('runtime', response, 1, 1)[0])
for i in range(0, 10):
    runtimes[i] = runtimes[i][1:len(runtimes[i])]
    runtimes[i] = runtimes[i][0:len(runtimes[i]) - 1]
total += titles
total += ratings
total += uids
total += runtimes
total += genres


def createJsonResponse(object):
    return JsonResponse(object, safe=False)


urlpatterns = [
    path('popularmovies', include(createJsonResponse(total)))
]
Thanks for the help!
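For reference, the error message itself points at the mismatch: urlpatterns must be a list of path() / re_path() entries whose second argument is a view callable, not the JsonResponse object that createJsonResponse(total) returns, and include() expects a URLconf rather than a response. A minimal sketch of that wiring (the view name popular_movies is made up here; it reuses the total list and the imports already shown above) might look like:

def popular_movies(request):
    # Return the precomputed list as JSON; safe=False allows a non-dict object.
    return JsonResponse(total, safe=False)

urlpatterns = [
    path('popularmovies', popular_movies),
]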
I have a list filled with the content of paragraph tags (<p></p>) scraped from a site using beautifulsoup4.
I would like to break that list into a nested list whose sublist names are incremented dynamically, with the increment driven by a byte-size check of the current sublist. The result should then be used to create a JSON object.
My current code, for example:
import requests, sys, json
from bs4 import BeautifulSoup


def getContent():
    page = requests.get("www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()
    data = {}
    SECTION_INDEX = 1
    data_container = []
    article_section_size = 0
    article_section_data = []
    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)
    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        data[article_section] = article_section_data
        article_section_size += sys.getsizeof(p)
        if article_section_size >= 300:
            SECTION_INDEX = SECTION_INDEX + 1
    return(data)


def createJson():
    data = getContent()
    json_source = {
        "ARTICLE_DATA": data
    }
    json_object = json.dumps(json_source, indent=2)


def main():
    createJson()
The actual result:
{
    "CONTENT_DATA": {
        "CONTENT_SECTION_1": [
            "the actual paragraphs",
            "content goes there",
            "some more content",
            "even more content from the site",
            "and some even more",
            "and finally, some more"
        ],
        "CONTENT_SECTION_2": [
            "the actual paragraphs",
            "content goes there",
            "some more content",
            "even more content from the site",
            "and some even more",
            "and finally, some more"
        ],
        "CONTENT_SECTION_3": [
            "the actual paragraphs",
            "content goes there",
            "some more content",
            "even more content from the site",
            "and some even more",
            "and finally, some more"
        ]
    }
}
The desired result:
{
    "CONTENT_DATA": {
        "CONTENT_SECTION_1": [
            "the actual paragraphs",
            "content goes there"
        ],
        "CONTENT_SECTION_2": [
            "some more content",
            "even more content from the site"
        ],
        "CONTENT_SECTION_3": [
            "and some even more",
            "and finally, some more"
        ]
    }
}
How can I achieve this, and why is the pattern repeated in the actual result above?
For anyone interested in the solution, the code to achieve the desired result is the following:
import requests, sys
from bs4 import BeautifulSoup


def getContent():
    page = requests.get("www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()
    data = {}
    SECTION_INDEX = 1
    data_container = []
    article_section_size = 0
    article_section_data = []
    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)
    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        article_section_size += sys.getsizeof(p)
        if article_section_size >= 300:
            data[article_section] = article_section_data
            article_section_data = []
            article_section_size = 0
            SECTION_INDEX = SECTION_INDEX + 1
    if article_section_data:
        data[article_section] = article_section_data
    return(data)
The repeated pattern in the actual result came from the fact that I kept appending every p element to the same article_section_data list, rather than resetting it to an empty list once the desired byte size had been reached.
I have a JSON log file and want to count and print the number of times each URL (requestUrl) has been hit by each IP (remoteIp) in that log file. The output should look like this:
IP(remoteIp): URL1-(Count), URL2-(Count), URL3...
127.0.0.1: http://www.google.com - 12, www.bing.com/servlet-server.jsp - 2, etc..
A sample entry from the log file looks like this:
"insertId": "kdkddkdmdkd",
"jsonPayload": {
"#type": "type.googleapis.com/google.cloud.loadbalancing.type.LoadBalancerLogEntry",
"enforcedSecurityPolicy": {
"configuredAction": "DENY",
"outcome": "DENY",
"preconfiguredExprIds": [
"owasp-crs-v030001-id942220-sqli"
],
"name": "shbdbbddjdjdjd",
"priority": 2000
},
"statusDetails": "body_denied_by_security_policy"
},
"httpRequest": {
"requestMethod": "POST",
"requestUrl": "https://dknnkkdkddkd/token",
"requestSize": "3004",
"status": 403,
"responseSize": "274",
"userAgent": "okhttp/3.12.2",
"remoteIp": "127.0.0.1",
"serverIp": "123.123.33.31",
"latency": "0.018728s"
}
The code I am using so far is below. It gives me the total hits per IP, or the total number of times a URL has been hit, but not the per-IP breakdown per URL.
import json
from collections import Counter

unique_ip = {}
request_url = {}


def getAndSaveValueSafely(freqTable, searchDict, key):
    try:
        tmp = searchDict['httpRequest'][key]
        if tmp in freqTable:
            freqTable[tmp] += 1
        else:
            freqTable[tmp] = 1
    except KeyError:
        if 'not_present' in freqTable:
            freqTable['not_present'] += 1
        else:
            freqTable['not_present'] = 1


with open("threat_intel_1.json") as file:
    data = json.load(file)
    for d2 in data:
        getAndSaveValueSafely(unique_ip, d2, 'remoteIp')
        getAndSaveValueSafely(request_url, d2, 'requestUrl')

mc_unique_ip = dict(Counter(unique_ip).most_common())
mc_request_url = dict(Counter(request_url).most_common())


def printing():
    a = str(len(unique_ip))
    b = str(len(request_url))
    # minTs and maxTs are defined elsewhere in the full script
    with open("output.txt", "w") as f1:
        print(
            f' Start Time of log = {minTs}'
            f' \n\n End Time of log = {maxTs} \n\n\n {a} Unique IP List = {mc_unique_ip} \n\n\n {b} Unique URL = {mc_request_url}', file=f1)
I don't think you need to use Counter here, and you are unlikely to see any benefit from it.
import json
from collections import defaultdict

result = {}  # start empty

with open("threat_intel_1.json") as file:
    data = json.load(file)

for d2 in data:
    req = d2.get('httpRequest', None)
    if not req:
        continue
    url = req['requestUrl']
    ip = req['remoteIp']
    result.setdefault(url, defaultdict(int))[ip] += 1

print(result)
# {"/endpoint.html": {"127.2.3.4": 15, "222.11.31.22": 2}}
If instead you want it the other way around, that's easy too:
for d2 in data:
    req = d2.get('httpRequest', None)
    if not req:
        continue
    url = req['requestUrl']
    ip = req['remoteIp']
    result.setdefault(ip, defaultdict(int))[url] += 1

# {"127.1.2.3": {"/endpoint1.html": 15, "/endpoint2.php": 1}, "33.44.55.66": {"/endpoint1.html": 5}, ...}
Instead of using defaultdict, you could replace that line with:
# result.setdefault(ip,defaultdict(int))[url] += 1
result.setdefault(ip,{})
result[ip][url] = result[ip].get(url,0) + 1
which arguably is more readable anyway...
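To get output shaped like the example in the question (one line per IP, with each URL and its count), a short formatting loop over the IP-keyed result built above would be enough; this is just a sketch, not part of the original answer:

# Print one line per IP: "ip: url1 - count1, url2 - count2, ..."
for ip, url_counts in result.items():
    line = ", ".join(f"{url} - {count}" for url, count in url_counts.items())
    print(f"{ip}: {line}")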
I have tried using import pandas as pd and am able to produce a CSV, but it is not laid out how I want. I want the data saved/printed across the columns rather than down as rows (so the job title as the header of column 'A' and all the results listed in the rows below it, etc.).
This is my code. What do I need to change to flip the results so they appear the other way, with all my results included?
from requests_html import HTMLSession
import re
import pandas as pd

url = 'https://company.onefootball.com/jobs/#jobs-wrap'

departmentcategories = {
    "android": "Software Development",
    "social media": "Marketing",
    "content ": "Marketing",
    "sales": "Sales",
}
languagecategories = {
    " and ": "English",
    " und ": "German",
}
experiencecategories = {
    "senior": "Mid Senior Level",
    "Junior": "Entry Level",
}

s = HTMLSession()
r = s.get(url)
r.html.render(sleep=1)
jobs = r.html.xpath('//*[@id="jobs-wrap"]', first=True)


def get_department_categories(department):
    depcats = []
    for k, v in departmentcategories.items():
        if re.search(k, department, re.IGNORECASE):
            depcats.append(v)
    return depcats


def get_language_categories(language):
    langcats = []
    for k, v in languagecategories.items():
        if re.search(k, language, re.IGNORECASE):
            langcats.append(v)
    return langcats


def get_experience_categories(experience):
    expcats = []
    for k, v in experiencecategories.items():
        if re.search(k, experience, re.IGNORECASE):
            expcats.append(v)
    return expcats


for item in jobs.absolute_links:
    r = s.get(item)
    job_title = r.html.find('h1.headline', first=True).text
    city = r.html.find('p.h6', first=True).text
    if city == ('Berlin, Germany'):
        city = 'Berlin'
    country = r.html.find('p.h6', first=True).text
    if country == ('Berlin, Germany'):
        country = 'Germany'

    # Section for the department, languages, and experience level
    department = r.html.find('div.job-content--parsed', first=True).text
    department_cats = get_department_categories(department)
    language = r.html.xpath('//*[@id="job"]/div[1]/div[2]', first=True).text
    language_cats = get_language_categories(language)
    experience = r.html.find('div.job-content--parsed', first=True).text
    experience_cats = get_experience_categories(experience)

    joblist = [job_title, city, country, "OneFootball", ", ".join(department_cats), ", ".join(experience_cats), ", ".join(language_cats), "Sport", item]

    df = pd.DataFrame(joblist)
    print(df.head())
    df.to_csv('newtest.csv')
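One way to get each job across the columns instead of down the rows (a sketch only; the column names below are made up and can be changed) is to collect one row per job inside the loop and build a single DataFrame afterwards:

import pandas as pd

# Hypothetical column headers; rename them to whatever the CSV should show.
columns = ["Job Title", "City", "Country", "Company", "Departments",
           "Experience", "Languages", "Industry", "Link"]

rows = []
# Inside the existing `for item in jobs.absolute_links:` loop, replace the
# per-iteration DataFrame with:
#     rows.append([job_title, city, country, "OneFootball",
#                  ", ".join(department_cats), ", ".join(experience_cats),
#                  ", ".join(language_cats), "Sport", item])

# After the loop, build one DataFrame where each job is a row and each field is a column.
df = pd.DataFrame(rows, columns=columns)
df.to_csv('newtest.csv', index=False)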
I have this list of hierarchical URLs:
data = ["https://python-rq.org/","https://python-rq.org/a","https://python-rq.org/a/b","https://python-rq.org/c"]
And I want to dynamically build a nested dictionary in which every URL that is a subdomain/subfolder of another URL is nested under it as a child.
I already tried the following, but it is not returning what I expect:
import re

result = []
for key, d in enumerate(data):
    form_dict = {}
    r_pattern = re.search(r"(http(s)?://(.*?)/)(.*)", d)
    r = r_pattern.group(4)
    if r == "":
        parent_url = r_pattern.group(3)
    else:
        parent_url = r_pattern.group(3) + "/" + r
    print(parent_url)
    temp_list = data.copy()
    temp_list.pop(key)
    form_dict["name"] = parent_url
    form_dict["children"] = []
    for t in temp_list:
        child_dict = {}
        if parent_url in t:
            child_dict["name"] = t
            form_dict["children"].append(child_dict.copy())
    result.append(form_dict)
This is the expected output.
{
    "name": "https://python-rq.org/",
    "children": [
        {
            "name": "https://python-rq.org/a",
            "children": [
                {
                    "name": "https://python-rq.org/a/b",
                    "children": []
                }
            ]
        },
        {
            "name": "https://python-rq.org/c",
            "children": []
        }
    ]
}
Any advice?
This was a nice problem. I tried carrying on with your regex method but got stuck, and found that split is actually well suited to this case. The following works:
data = ["https://python-rq.org/", "https://python-rq.org/a", "https://python-rq.org/a/b", "https://python-rq.org/c"]

temp_list = data.copy()
# This removes the last "/" if any URL ends with one. It makes it a lot easier
# to match the URLs and is not necessary to have a correct link.
data = [x[:-1] if x[-1] == "/" else x for x in data]
print(data)

result = []


# To find a matching parent
def find_match(d, res):
    for t in res:
        if d == t["name"]:
            return t
        elif len(t["children"]) > 0:
            temp = find_match(d, t["children"])
            if temp:
                return temp
    return None


while len(data) > 0:
    d = data[0]
    form_dict = {}
    l = d.split("/")
    # I removed regex as matching the last parentheses wasn't working out;
    # split does just what you need however
    parent = "/".join(l[:-1])
    data.pop(0)
    form_dict["name"] = d
    form_dict["children"] = []
    option = find_match(parent, result)
    if option:
        option["children"].append(form_dict)
    else:
        result.append(form_dict)

print(result)
[{'name': 'https://python-rq.org', 'children': [{'name': 'https://python-rq.org/a', 'children': [{'name': 'https://python-rq.org/a/b', 'children': []}]}, {'name': 'https://python-rq.org/c', 'children': []}]}]
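As a small usage note (not part of the original answer), the pretty-printed form shown in the question can be produced from that result with the standard json module:

import json

# result is a list with a single root node here, so dump its first element
# with indentation to get the nested structure from the expected output.
print(json.dumps(result[0], indent=4))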
I am working with an API that doesn't return all the information I need in a single call, and I need to add the ID of the project it came from into the result of the call I am making. Right now my code appends the project data as a separate item in the list, but I really need it to be part of the original commit dictionary. Here is my output now:
[{"committer_email": "justin.m.boucher#example.com", "short_id": "981147b9", "title": "Added .gitignore", "author_email": "justin.m.boucher#example.com", "authored_date": "2017-08-29T08:31:11.000-07:00", "created_at": "2017-08-29T08:31:11.000-07:00", "author_name": "Justin Boucher", "parent_ids": [], "committed_date": "2017-08-29T08:31:11.000-07:00", "message": "Added .gitignore\n", "committer_name": "Justin Boucher", "id": "981147b905913a60796283ce10f915c53679df49"}, {"project_id": "2"}]
Here is the output I want to achieve:
[{"project_id": "2", "committer_email": "justin.m.boucher#example.com", "short_id": "981147b9", "title": "Added .gitignore", "author_email": "justin.m.boucher#example.com", "authored_date": "2017-08-29T08:31:11.000-07:00", "created_at": "2017-08-29T08:31:11.000-07:00", "author_name": "Justin Boucher", "parent_ids": [], "committed_date": "2017-08-29T08:31:11.000-07:00", "message": "Added .gitignore\n", "committer_name": "Justin Boucher", "id": "981147b905913a60796283ce10f915c53679df49"}]
Here is my code so far:
get_commits.py:
import gitlab
import json

gitlab = gitlab.Gitlab()
projects = gitlab.getProjectID()

for i in projects:
    api_str = '/projects/' + str(i) + '/repository/commits'
    connect = gitlab.connectAPI(apiCall=api_str)
    data = json.dumps(connect)
    # Append project id to json, since it isn't created
    # in the commits from Gitlab
    commit = json.loads(data)
    commit.append({'project_id': str(i)})
    # make it pretty again for Splunk to read
    commit = json.dumps(commit)
    print commit
gitlab.py
import os
import ConfigParser
import requests
import json

# Setup Splunk Environment
APPNAME = 'splunk_gitlab'
CONFIG = 'appconfig.conf'
SPLUNK_HOME = os.environ['SPLUNK_HOME']
parser = ConfigParser.SafeConfigParser()


class Gitlab():
    # # Load Settings
    # parser.read(SPLUNK_HOME + '/etc/apps/' + APPNAME + '/local/' + CONFIG)
    # if parser.has_section('Authentication'):
    #     pass
    # else:
    #     parser.read(SPLUNK_HOME + '/etc/apps/' + APPNAME + '/default/' + CONFIG)
    #
    # GITLAB_URL = parser.get('Authentication', 'GITLAB_URL')
    # API_KEY = parser.get('Authentication', 'API_KEY')

    # Used for testing only
    GITLAB_URL = 'http://<my_address>'
    API_KEY = '<my_key>'
    API_SERVER = GITLAB_URL + '/api/v4'

    # Place api call to retrieve data
    def connectAPI(self, apiCall='/projects'):
        headers = {
            'PRIVATE-TOKEN': self.API_KEY
        }
        final_url = self.API_SERVER + apiCall
        resp = requests.get(final_url, headers=headers)
        status_code = resp.status_code
        resp = resp.json()
        if status_code == 200:
            return resp
        else:
            raise Exception("Something went wrong requesting (%s): %s" % (
                resp['errors'][0]['errorType'], resp['errors'][0]['message']))

    def getProjectID(self):
        connect = self.connectAPI(apiCall='/projects')
        data = json.dumps(connect)
        projects = json.loads(data)
        project_list = []
        for i in projects:
            project_list.append(i['id'])
        return project_list
If you want to add a new element to the first dictionary in the list instead of appending a new dictionary to the list, try using assignment instead of append.
commit[0]['project_id'] = str(i)
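In context, inside the loop in get_commits.py, that would look something like the sketch below (assuming, as in the sample output, that each call returns a list of commit dictionaries and the ID only needs to go on the first one):

for i in projects:
    api_str = '/projects/' + str(i) + '/repository/commits'
    connect = gitlab.connectAPI(apiCall=api_str)
    commit = json.loads(json.dumps(connect))
    # Assign the project id into the first commit dict instead of
    # appending a separate {'project_id': ...} dict to the list.
    commit[0]['project_id'] = str(i)
    commit = json.dumps(commit)
    print(commit)

If the id should appear on every commit in the list rather than just the first one, a small loop over commit (for c in commit: c['project_id'] = str(i)) would do that instead; that variation is an assumption, not something the answer above states.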