Organizing Results in Python

Alright, so basically I have a Google script that searches for a keyword. The results look like:
http://www.example.com/user/1234
http://www.youtube.com/user/125
http://www.forum.com/user/12
What could I do to organize these results like this:
Forums:
http://www.forum.com/user/12
YouTubes:
http://www.youtube.com/user/125
Unidentified:
http://www.example.com/user/1234
By the way, I'm organizing them by keywords: if the URL has "forum" in it, it goes to the forum list; if it has "youtube", it goes to the YouTube list; and if no keyword matches, it goes to unidentified.

1. Create a dict, and assign an empty list to each keyword you have, e.g.:
my_dict = {'forums': [], 'youtube': [], 'unidentified': []}
2. Iterate over your URLs.
3. Generate a key for each URL (the domain name, in your case); you can extract the key using the re regex module.
4. Check the dictionary from step 1 for this key: if it does not exist, append the URL to the 'unidentified' list; if it exists, append the URL to the list stored under that key.

Something like this? I guess you will be able to adapt this example to your needs:
import pprint
import re

urls = ['http://www.example.com/user/1234',
        'http://www.youtube.com/user/126',
        'http://www.youtube.com/user/125',
        'http://www.forum.com/useryoutube/12']

pattern = re.compile(r'//www\.(\w+)\.')
keys = ['forum', 'youtube']
results = dict()
for u in urls:
    ms = pattern.search(u)
    key = ms.group(1)
    if key in keys:
        results.setdefault(key, []).append(u)
pprint.pprint(results)
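Note that this example silently drops URLs that match no keyword. To get the 'unidentified' bucket from step 4, a minimal tweak (reusing the urls and pattern above) could look like:
results = {'forum': [], 'youtube': [], 'unidentified': []}
for u in urls:
    ms = pattern.search(u)              # may be None if the pattern misses
    key = ms.group(1) if ms else None
    if key not in ('forum', 'youtube'):
        key = 'unidentified'            # step-4 fallback bucket
    results[key].append(u)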

import urlparse

urls = """
http://www.example.com/user/1234
http://www.youtube.com/user/125
http://www.forum.com/user/12
""".split()

categories = {
    "youtube.com": [],
    "forum.com": [],
    "unknown": [],
}

for url in urls:
    netloc = urlparse.urlparse(url).netloc
    if netloc.count(".") == 2:
        # chop sub-domain
        netloc = netloc.split(".", 1)[1]
    if netloc in categories:
        categories[netloc].append(url)
    else:
        categories["unknown"].append(url)

print categories
Parse the URLs, find the category, append the full URL.
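In Python 3 the urlparse module moved to urllib.parse; a minimal port of the same idea:
from urllib.parse import urlparse

urls = """
http://www.example.com/user/1234
http://www.youtube.com/user/125
http://www.forum.com/user/12
""".split()

categories = {"youtube.com": [], "forum.com": [], "unknown": []}
for url in urls:
    netloc = urlparse(url).netloc
    if netloc.count(".") == 2:
        netloc = netloc.split(".", 1)[1]  # chop sub-domain
    key = netloc if netloc in categories else "unknown"
    categories[key].append(url)
print(categories)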

You should probably keep your categorized results in a dictionary and the uncategorized ones in a list. You could then categorize them like so:
categorized_results = {"forum": [], "youtube": []}
uncategorized_results = []
for url in results:
    parts = url.split(".")
    uncategorized = True
    for key in categorized_results:
        if key in parts:
            categorized_results[key].append(url)
            uncategorized = False
    if uncategorized:
        uncategorized_results.append(url)
If you'd like to output it neatly:
category_aliases = {"forum": "Forums:", "youtube": "YouTubes:"}
for category in categorized_results:
    print(category_aliases[category])
    for url in categorized_results[category]:
        print(url)
    print("\n")
print("Unidentified:")
print("\n".join(uncategorized_results))  # Let's not put in another for loop.

How about this:
from urlparse import urlparse

class Organizing_Results(object):
    CATEGORY = {'example': [], 'youtube': [], 'forum': []}

    def __init__(self):
        self.url_list = []

    def add_single_url(self, url):
        self.url_list.append(urlparse(url))

    def _reduce_result_list(self, acc, element):
        for c in self.CATEGORY:
            if c in element[1]:  # element[1] is the parsed netloc
                return self.CATEGORY[c].append(element)
        return self.CATEGORY['example'].append(element)

    def get_result(self):
        reduce(lambda x, y: self._reduce_result_list(x, y), self.url_list, [])
        return self.CATEGORY

c = Organizing_Results()
c.add_single_url('http://www.example.com/user/1234')
c.add_single_url('http://www.youtube.com/user/1234')
c.add_single_url('http://www.unidentified.com/user/1234')
c.get_result()
You can easily extend the class with more functions as you need.
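One caveat: this snippet is Python 2. Under Python 3 it needs two import changes, since urlparse moved to urllib.parse and reduce moved to functools:
from urllib.parse import urlparse
from functools import reduce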

Related

Writing scraped headers from webpages to pandas frame

I wrote this code to download the h1, h2 and h3 headers from a list of URLs and write them to a pandas frame, but it raises an unpacking error: expected 3 values.
import requests
from bs4 import BeautifulSoup

# proxy_data and url_list are defined elsewhere in the script

def url_corrector(url):
    if not str(url).startswith('http'):
        return "https://" + str(url)
    else:
        return str(url)

def header_agg(url):
    h1_list = []
    h2_list = []
    h3_list = []
    p = requests.get(url_corrector(url), proxies=proxy_data, verify=False)
    soup = BeautifulSoup(p.text, 'lxml')
    for tag in soup.find_all('h1'):
        h1_list.append(tag.text)
    for tag in soup.find_all('h2'):
        h2_list.append(tag.text)
    for tag in soup.find_all('h3'):
        h3_list.append(tag.text)
    return h1_list, h2_list, h3_list

headers_frame = url_list.copy()
headers_frame['H1'], headers_frame['H2'], headers_frame['H3'] = headers_frame.url.map(lambda x: header_agg(x))
Any help on how to do it?
Getting this error:
ValueError: too many values to unpack (expected 3)
Let's assume that url_list is a dict with the following structure:
url_list = {'url': [<url1>, <url2>, <url3>, <url4>, ..., <urln>]}
The call to headers_frame.url.map(lambda x: header_agg(x)) will return a list with n elements in the form:
[<url1(h1_list, h2_list, h3_list)>, <url2(h1_list, h2_list, h3_list)>, ..., <urln(h1_list, h2_list, h3_list)>]
For the code to produce the output you require, you may have to re-write the last statement as a loop:
headers_frame.update({'H1': [], 'H2': [], 'H3': []})
for url in headers_frame.url:
    headers = header_agg(url)
    headers_frame['H1'].extend(headers[0])
    headers_frame['H2'].extend(headers[1])
    headers_frame['H3'].extend(headers[2])
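Alternatively, since headers_frame.url.map(header_agg) yields a sequence of 3-tuples, you can transpose it with zip before assigning. A minimal sketch, assuming url_list is actually a pandas DataFrame with a url column:
results = headers_frame.url.map(header_agg)  # one (h1_list, h2_list, h3_list) tuple per URL
headers_frame['H1'], headers_frame['H2'], headers_frame['H3'] = zip(*results)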
You have to return a single entity. Just change the return statement to:
return [h1_list, h2_list, h3_list]
I did this to work around the issue; however, I'm still unsure why the original isn't working.
headers_frame = url_list.copy()
H1 = []
H2 = []
H3 = []
for url in headers_frame.url:
    k = header_agg(url)
    H1.append(k[0])
    H2.append(k[1])
    H3.append(k[2])
pd.DataFrame(np.column_stack([headers_frame.url, H1, H2, H3]))
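As a side note, np.column_stack discards the column names. Building the frame from a dict keeps them; a minimal variant using the same lists, assuming headers_frame has a default integer index:
pd.DataFrame({'url': headers_frame.url, 'H1': H1, 'H2': H2, 'H3': H3})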

Parsing and creating a dict python

I would like to create a dict by parsing a string
<brns ret = "Herld" other = "very">
<brna name = "ame1">
I would like to create a dict that has the following key-value pairs:
dict = {'brnsret': 'Herld',
        'brnsother': 'very',
        'brnaname': 'ame1'}
I have a working script that can handle this:
<brns ret = "Herld">
<brna name = "ame1">
my Code to generate the dict:
match_tag = re.search(r'<(\w+)\s(\w+) = \"(\w+)\">', each_par_line)
if match_tag is not None:
    dict_tag[match_tag.group(1) + match_tag.group(2)] = match_tag.group(3)
But how should I tweak my script to handle more than one attribute pair in a tag?
Thanks
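One way to handle a variable number of attribute pairs is to match the tag name once and then run re.findall over the rest of the tag. A minimal sketch, with each_par_line and dict_tag as in the question:
import re

dict_tag = {}
each_par_line = '<brns ret = "Herld" other = "very">'
tag_match = re.search(r'<(\w+)\s+(.*)>', each_par_line)
if tag_match is not None:
    tag_name, attr_text = tag_match.groups()
    # pick up every attr = "value" pair inside the tag
    for attr, value in re.findall(r'(\w+)\s*=\s*"(\w+)"', attr_text):
        dict_tag[tag_name + attr] = value
print(dict_tag)  # {'brnsret': 'Herld', 'brnsother': 'very'}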
An alternative option, and probably just for educational reasons: you can pass this kind of string into a lenient HTML parser like BeautifulSoup:
from bs4 import BeautifulSoup

data = """
<brns ret = "Herld" other = "very">
<brna name = "ame1">
"""

d = {tag.name + attr: value
     for tag in BeautifulSoup(data, "html.parser")()
     for attr, value in tag.attrs.items()}
print(d)
Prints:
{'brnaname': 'ame1', 'brnsother': 'very', 'brnsret': 'Herld'}

Python - Getting Attributes From A File of Constants

I have a file of constant variables that I need to query and I am not sure how to go about it.
I have a database query which is returning user names and I need to find the matching user name in the file of constant variables.
The file looks like this:
SALES_MANAGER_01 = {"user_name": "BO01", "password": "password", "attend_password": "BO001",
                    "csm_password": "SM001", "employee_num": "BOSM001"}
There is just a bunch of users just like the one above.
My function looks like this:
#attr("user_test")
def test_get_user_for_login(self):
application_code = 'BO'
user_from_view = self.select_user_for_login(application_code=application_code)
users = [d['USER'] for d in user_from_view]
user_with_ent = choice(users)
user_wo_ent = user_with_ent[-4:]
password = ""
global_users = dir(gum)
for item in global_users:
if user_wo_ent not in item.__getattr__("user_name"):
user_with_ent = choice(users)
user_wo_ent = user_with_ent[-4:]
else:
password = item.__getattr__("password")
print(user_wo_ent, password)
Here gum is my file of constants (global_users = dir(gum)). So I know I am doing something wrong, since I am getting AttributeError: 'str' object has no attribute '__getattr__'; I am just not sure how to go about resolving it.
You should reverse your looping as you want to compare each item to your match condition. Also, you have a dictionary, so use it to do some heavy lifting.
You need to add some imports:
import re
from ast import literal_eval
I've changed the dir(gum) bit to be this function.
def get_global_users(filename):
    gusers = {}  # create a global users dict
    p_key = re.compile(ur'\b\w*\b')  # regex to get the first part, e.g. SALES_MANAGER_01
    p_value = re.compile(ur'\{.*\}')  # regex to grab everything in {}
    with open(filename) as f:  # open the file and work through it
        for line in f:  # for each line
            gum_key = p_key.match(line)  # pull out the key
            gum_value = p_value.search(line)  # pull out the value
            # here is the real action: update the dictionary
            # with the match of gum_key and the match of gum_value
            gusers[gum_key.group()] = literal_eval(gum_value.group())
    return gusers  # return the dictionary
The bottom of your existing code is replaced with this.
global_users = get_global_users(gum)  # assign return to global_users
for key, value in global_users.iteritems():  # walk through all key, value pairs
    if value['user_name'] != user_wo_ent:
        user_with_ent = choice(users)
        user_wo_ent = user_with_ent[-4:]
    else:
        password = value['password']
So a very simple answer was to take the dir of the constants module and then iterate over it like so:
global_users = dir(gum)
for item in global_users:
    o = gum.__dict__[item]
    if type(o) is not dict:
        continue
    if o.get("user_name") == user_wo_ent:
        print(user_wo_ent, o.get("password"))
    else:
        print("User was not in global_user_mappings")
I was able to find the answer by doing the following:
def get_user_for_login(application_code='BO'):
    user_from_view = BaseServiceTest().select_user_for_login(application_code=application_code)
    users = [d['USER'] for d in user_from_view]
    user_with_ent = choice(users)
    user_wo_ent = user_with_ent[4:]
    global_users = dir(gum)
    user_dict = {'user_name': '', 'password': ''}
    for item in global_users:
        o = gum.__dict__[item]
        if type(o) is not dict:
            continue
        if user_wo_ent == o.get("user_name"):
            user_dict['user_name'] = user_wo_ent
            user_dict['password'] = o.get("password")
    return user_dict

Creating nested Json structure with multiple key values in Python from Json

My code is as follows:
import json

def reformat(importscompanies):
    #print importscompanies
    container = {}
    child = []
    item_dict = {}
    for name, imports in importscompanies.iteritems():
        item_dict['name'] = imports
        item_dict['size'] = '500'
        child.append(dict(item_dict))
        container['name'] = name
        container['children'] = child

if __name__ == '__main__':
    raw_data = json.load(open('data/bricsinvestorsfirst.json'))
    run(raw_data)

def run(raw_data):
    raw_data2 = raw_data[0]
    the_output = reformat(raw_data2)
My issue is, the code isn't going through the whole file. It's only outputting one entry. Why is this? Am I rewriting something and do I need another dict that appends with every loop?
Also, it seems as though the for loop is going through the iteritems for each dict key. Is there a way to make it pass only once?
The issue is indeed:
raw_data2 = raw_data[0]
I ended up creating an iterator to access the dict values.
Thanks.
Lastly, I'm hoping my final JSON file looks this way, using the data I provided above:
{'name': u'name', 'children': [{'name': u'500 Startups', 'size': '500'}, {'name': u'AffinityChina', 'size': '500'}]}
Try this. Though your sample input and output data don't really give many clues as to where the "name" fields should come from, I've assumed you wanted the name of the original item in your list.
original_json = json.load(open('data/bricsinvestorsfirst.json', 'r'))
response_json = {}
response_json["name"] = "analytics"

# where your children list will go
children = []
size = 500  # or whatever else you want

# For each item in your original list
for item in original_json:
    children.append({"name": item["name"],
                     "size": size})

response_json["children"] = children
print json.dumps(response_json, indent=2)
"It's only outputting one entry" because you only select the first dictionary in the JSON file when you say raw_data2 = raw_data[0]
Try something like this as a starting point (I haven't tested/ran it):
import json
def run():
with open('data/bricsinvestorsfirst.json') as input_file:
raw_data = json.load(input_file)
children = []
for item in raw_data:
children.append({
'name': item['name'],
'size': '500'
})
container = {}
container['name'] = 'name'
container['children'] = children
return json.dumps(container)
if __name__ == '__main__':
print run()

Python json dumps syntax error when appending list of dict

I have two functions that return lists of dictionaries, and I'm trying to get json to encode them. It worked when I tried it with my first function, but now that I'm appending the second function I get a syntax error of ": expected". I will eventually be appending a total of 7 functions that each output a list of dicts. Is there a better way of accomplishing this?
import dmidecode
import simplejson as json

def get_bios_specs():
    BIOSdict = {}
    BIOSlist = []
    for v in dmidecode.bios().values():
        if type(v) == dict and v['dmi_type'] == 0:
            BIOSdict["Name"] = str((v['data']['Vendor']))
            BIOSdict["Description"] = str((v['data']['Vendor']))
            BIOSdict["BuildNumber"] = str((v['data']['Version']))
            BIOSdict["SoftwareElementID"] = str((v['data']['BIOS Revision']))
            BIOSdict["primaryBIOS"] = "True"
            BIOSlist.append(BIOSdict)
    return BIOSlist

def get_board_specs():
    MOBOdict = {}
    MOBOlist = []
    for v in dmidecode.baseboard().values():
        if type(v) == dict and v['dmi_type'] == 2:
            MOBOdict["Manufacturer"] = str(v['data']['Manufacturer'])
            MOBOdict["Model"] = str(v['data']['Product Name'])
            MOBOlist.append(MOBOdict)
    return MOBOlist

def get_json_dumps():
    jsonOBJ = json
    # Syntax error is here: I can't use a comma to continue adding more, nor + to append.
    return jsonOBJ.dumps({'HardwareSpec':{'BIOS': get_bios_specs()},{'Motherboard': get_board_specs()}})
Use multiple items within your nested dictionary.
jsonOBJ.dumps({
    'HardwareSpec': {
        'BIOS': get_bios_specs(),
        'Motherboard': get_board_specs()
    }
})
And if you want multiple BIOS items or Motherboard items, just use a list.
...
'HardwareSpec': {
    'BIOS': [
        get_bios_specs(),
        get_uefi_specs()
    ]
    ...
}
If you want a more convenient lookup of specs, you can just embed a dict:
jsonOBJ.dumps({'HardwareSpec': {'BIOS': get_bios_specs(),
                                'Motherboard': get_board_specs()
                                }
               })
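Since you will eventually have seven such functions, you could also register them in a dict and build the payload in one comprehension. A minimal sketch; the mapping below names only the two functions from the question:
SPEC_FUNCS = {
    'BIOS': get_bios_specs,
    'Motherboard': get_board_specs,
    # register the remaining spec functions here
}
payload = json.dumps({'HardwareSpec': {name: func() for name, func in SPEC_FUNCS.items()}})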
