How to use Python multiprocessing to get a global sum after traversing through HTTP requests?

I'm trying to write an algorithm that traverses the entire collection of nodes and returns the sum of their rewards. Each reward should only be counted a single time. The input to the algorithm will be a URL for a node to begin with, such as http://fake.url/a.
Each GET request to a node's URL returns JSON like this:
{
    "children": [
        "http://fake.url/b",
        "http://fake.url/c"
    ],
    "reward": 1
}
Here's what I've tried:
import multiprocessing
import requests
import json

my_q = multiprocessing.Queue()
my_list = ['http://fake.url/a']
reward_sum = 0

def enqueue(q):
    for data in my_list:
        q.put(data)

def get_it(q):
    while not q.empty():
        item = q.get()
        print(item)
        response = requests.get(item)
        kids = json.loads(response.content)
        print(f'URL: {item} --> {kids["reward"]}')
        for kid in kids['children']:
            print(kid)
            q.put(kid)

p1 = multiprocessing.Process(target=enqueue, args=(my_q,))
p2 = multiprocessing.Process(target=get_it, args=(my_q,))
p1.start()
p2.start()
p1.join()
p2.join()
What works above:
I am using multiprocessing.
I am accessing the children and rewards correctly.
I am getting output like this:
http://fake.url/a
URL: http://fake.url/a --> 1
{'children': ['http://fake.url/b', 'http://fake.url/c'], 'reward': 1}
http://fake.url/b
http://fake.url/c
http://fake.url/b
URL: http://fake.url/b --> 2
{'children': ['http://fake.url/d', 'http://fake.url/e'], 'reward': 2}
http://fake.url/d
http://fake.url/e
http://fake.url/c
URL: http://fake.url/c --> 3
{'children': ['http://fake.url/f', 'http://fake.url/g'], 'reward': 3}
http://fake.url/f
http://fake.url/g
http://fake.url/d
URL: http://fake.url/d --> 4
{'reward': 4}
http://fake.url/e
URL: http://fake.url/e --> 5
{'reward': 5}
http://fake.url/f
URL: http://fake.url/f --> 6
{'children': ['http://fake.url/h'], 'reward': 6}
http://fake.url/h
http://fake.url/g
What are the problems I need help with:
How to keep track of the total reward sum in a global variable?
How to keep track of a global "seen" set, so I don't add duplicates to the total reward sum?

def get_it(q):
    rewards_total = 0
    seen = set()
    while not q.empty():
        item = q.get()
        print(item)
        if item in seen:
            continue
        seen.add(item)
        response = requests.get(item)
        kids = json.loads(response.content)
        rewards_total += kids["reward"]
        print(f'URL: {item} --> {kids["reward"]}')
        for kid in kids['children']:
            print(kid)
            q.put(kid)
    return rewards_total
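For reference, one way to share both the running total and the "seen" set across several worker processes (a sketch of my own, not from the post, assuming the same fake URLs) is to combine a Manager dict, a shared Value and a Lock:

import multiprocessing
import queue
import requests

def worker(q, seen, total, lock):
    while True:
        try:
            url = q.get(timeout=3)          # give up once no new URLs arrive
        except queue.Empty:
            break
        with lock:
            if url in seen:                 # skip URLs another worker already handled
                continue
            seen[url] = True
        data = requests.get(url).json()
        with lock:
            total.value += data.get('reward', 0)
        for child in data.get('children', []):
            q.put(child)

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    q = multiprocessing.Queue()
    seen = manager.dict()                    # shared stand-in for a "seen" set
    total = multiprocessing.Value('i', 0)    # shared running reward sum
    lock = multiprocessing.Lock()
    q.put('http://fake.url/a')
    workers = [multiprocessing.Process(target=worker, args=(q, seen, total, lock))
               for _ in range(4)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()
    print('total reward:', total.value)

The lock guards the check-and-add on seen so two workers cannot count the same URL twice, and the timeout on q.get() lets the workers exit once the traversal has drained.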

Related

Multiprocessing and relationship traversal?

I am trying to implement multiprocessing to speed up traversing a relationship graph. I want to capture items that have a total less than 1000. If the parent is over 1000, process the children until there are no more to check.
I've mocked up an illustration showing that ThreadPoolExecutor only processes the initial items provided to the class, even though the class's search_queue_list is still being populated. I also tried using a Queue instead of a list, with similar results. Synchronous processing works as expected for both the list and the Queue. Is there a way to make multiprocessing work here when the initial array of items can change?
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from time import sleep

dummy_data = {
    'id1': {'total': 1001, 'children': ['id101', 'id102']},  # over 1000, children will be processed
    'id2': {'total': 999, 'children': ['id201', 'id202']},   # under 1000, children won't be processed
    'id101': {'total': 501, 'children': ['more_children']},
    'id102': {'total': 500, 'children': ['more_children']},
    'id201': {'total': 499, 'children': ['more_children']},
    'id202': {'total': 500, 'children': ['more_children']},
}

class SearchDummy(object):
    def __init__(self, start_list):
        # with list
        self.search_queue_list = start_list
        # with Queue
        self.search_queue_queue = Queue()
        for item in self.search_queue_list:
            self.search_queue_queue.put(item)
        self.good_ids = []

    def get_total(self, search_id):
        # artificial delay
        sleep(0.5)
        return dummy_data[search_id]['total']

    def get_children(self, search_id):
        # artificial delay
        sleep(0.5)
        return dummy_data[search_id]['children']

    # START LIST
    def current_check_list(self):
        # get first element in search_queue_list
        current_id = self.search_queue_list.pop(0)
        # check if current_id['total'] is over 1000
        if self.get_total(current_id) <= 1000:
            self.good_ids.append(current_id)
        else:
            # append children to search_queue_list
            self.search_queue_list.extend(self.get_children(current_id))

    def search_list(self):
        while self.search_queue_list:
            self.current_check_list()

    def multi_search_list(self):
        with ThreadPoolExecutor() as e:
            while self.search_queue_list:
                e.submit(self.current_check_list)
    # END LIST

    # START QUEUE
    def current_check_queue(self):
        # get item from search_queue_queue
        current_id = self.search_queue_queue.get()
        # check if current_id['total'] is over 1000
        if self.get_total(current_id) <= 1000:
            self.good_ids.append(current_id)
        else:
            # put children in search_queue_queue
            for child in self.get_children(current_id):
                self.search_queue_queue.put(child)

    def search_queue(self):
        while not self.search_queue_queue.empty():
            self.current_check_queue()

    def multi_search_queue(self):
        with ThreadPoolExecutor() as e:
            while not self.search_queue_queue.empty():
                e.submit(self.current_check_queue)
    # END QUEUE
# synchronous list
s = SearchDummy(['id1','id2'])
s.search_list()
print('List output', s.good_ids) # returns ['id101', 'id102', 'id2']
print('Remaining list size', len(s.search_queue_list)) # returns 0
# synchronous queue
s = SearchDummy(['id1','id2'])
s.search_queue()
print('Queue output', s.good_ids) # returns ['id101', 'id102', 'id2']
print('Remaining queue size', s.search_queue_queue.qsize()) # returns 0
# multiprocessing list
s = SearchDummy(['id1','id2'])
s.multi_search_list()
print('Multi list output', s.good_ids) # returns ['id2']
print('Multi list remaining', s.search_queue_list) # returns ['id101', 'id102']
# multiprocessing queue
s = SearchDummy(['id1','id2'])
s.multi_search_queue()
print('Multi queue output', s.good_ids) # returns ['id2']
print('Multi queue remaining', list(s.search_queue_queue.queue)) # returns ['id101', 'id102']
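One way to handle work that is discovered while the pool is already running (a sketch of my own, not part of the original question, reusing the question's dummy_data) is to keep the submission loop in the main thread and only stop once the queue is empty and every submitted future has finished:

from concurrent.futures import ThreadPoolExecutor
from queue import Queue, Empty
from time import sleep

dummy_data = {
    'id1': {'total': 1001, 'children': ['id101', 'id102']},
    'id2': {'total': 999, 'children': ['id201', 'id202']},
    'id101': {'total': 501, 'children': ['more_children']},
    'id102': {'total': 500, 'children': ['more_children']},
    'id201': {'total': 499, 'children': ['more_children']},
    'id202': {'total': 500, 'children': ['more_children']},
}

def check(search_id, q, good_ids):
    sleep(0.5)                                   # artificial delay
    if dummy_data[search_id]['total'] <= 1000:
        good_ids.append(search_id)               # list.append is atomic under the GIL
    else:
        for child in dummy_data[search_id]['children']:
            q.put(child)                         # new work discovered at runtime

good_ids = []
q = Queue()
for start in ('id1', 'id2'):
    q.put(start)

with ThreadPoolExecutor() as e:
    futures = []
    while True:
        try:
            item = q.get(timeout=1)              # wait briefly for late arrivals
        except Empty:
            if all(f.done() for f in futures) and q.empty():
                break                            # nothing queued and nothing running
            continue
        futures.append(e.submit(check, item, q, good_ids))

print(good_ids)   # e.g. ['id2', 'id101', 'id102'] (order may vary)

The timeout on q.get gives in-flight workers a chance to add children before the loop decides there is nothing left to do.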

How to loop through values in JSON and assign to another dictionary

I am developing a Python/Django web app. I am trying to parse JSON into a python dictionary, read the values in the dictionary, and assign the values to another dictionary if certain conditions are met.
JSON is structured like this:
{content: {cars: [0, 1, 2]}, other_stuff: []}
Each car has multiple attributes:
0: {"make", "model", "power"...}
Each attribute has three variables:
make: {"value": "Toyota", "unit": "", "user_edited": "false"}
I am trying to assign the values from the JSON to other dictionaries: car_0, car_1 and car_2. In this test the JSON response is identical for each car, except that the 'make' of the first car is changed to 'Nissan'; I then want to change the make of car_0 to 'Nissan' as well. I'm parsing the JSON in the following way:
local_cars = [car_0, car_1, car_2] # Dictionaries which are already initialized.
print(local_cars[0] is local_cars[1]) # Prints: false
print(local_cars[0]['make']['value']) # Prints: Toyota (yes)
print(local_cars[1]['make']['value']) # Prints: Toyota (yes)
print(local_cars[2]['make']['value']) # Prints: Toyota (yes)

counter = 0
if request.method == 'POST':
    payload = json.loads(request.body)
    if bool(payload):
        print(len(local_cars)) # Prints: 3
        print(counter, payload['cars'][0]['make']['value']) # Prints: Nissan (yes)
        print(counter, payload['cars'][1]['make']['value']) # Prints: Toyota (yes)
        print(counter, payload['cars'][2]['make']['value']) # Prints: Toyota (yes)
        print(counter, local_cars[0]['make']['value']) # Prints: Toyota (yes)
        print(counter, local_cars[1]['make']['value']) # Prints: Toyota (yes)
        print(counter, local_cars[2]['make']['value']) # Prints: Toyota (yes)
        for target_car in payload['cars']: # Loop through all three cars in payload
            print(local_cars[0] is local_cars[1]) # false
            for attr in target_car.items(): # Loop through all key:dict pairs of a single car
                attribute_key = attr[0] # Key (eg. 'make')
                vars_dict = attr[1] # Dictionary of variables ('value': 'xx', 'unit': 'yy', 'user_edited': 'zz')
                if vars_dict['user_edited'] == 'true':
                    local_cars[counter][attribute_key]['user_edited'] = 'true'
                    local_cars[counter][attribute_key]['value'] = vars_dict['value']
            print(counter, local_cars[counter]['make']['value']) # Prints: 0, Toyota (yes), 1, Nissan (no!), 2, Nissan (no!)
            counter = counter + 1
What I don't understand is why the other cars, local_cars[1] and local_cars[2], are affected at all by this loop. As can be seen, their 'make' is somehow changed to 'Nissan' even though it was 'Toyota' in the request body. This seems to happen in the first iteration of for target_car in payload['cars'].
Abandoning the loop/counter and focusing on one car does not make any difference:
for target_car in payload['cars']: --> target_car = payload['cars'][0]:
...
local_cars[0][attribute_key]['user_edited'] = 'true'
local_cars[0][attribute_key]['value'] = vars_dict['value']
What am I doing wrong? How can car_1 and car_2 be affected when I change the only part of the code that edits any values in those dictionaries so that it touches only local_cars[0]?
UPDATED
Received the correct answer for this. Before the part of code originally posted, I initialized the car_0, car_1 and car_2 dictionaries.
What I did before was:
default_car = model_to_dict(Car.objects.first())
car_0 = {}
car_1 = {}
car_2 = {}
attribute = {}
i = 0
for key, value in default_car.items():
    if i > 1:
        attribute[key] = {"value": value, "unit": units.get(key), "user_edited": "false"}
    i = i + 1
car_0.update(attribute)
car_1.update(attribute)
car_2.update(attribute)
local_cars = [car_0, car_1, car_2]
...
Apparently the problem was that every car_x shared references to the same nested attribute dictionaries. I solved the problem by changing the car_x initialization to the following:
default_car = model_to_dict(Car.objects.first())
car_0 = {}
car_1 = {}
car_2 = {}
attribute_0 = {}
attribute_1 = {}
attribute_2 = {}
i = 0
for key, value in default_car.items():
    if i > 1:
        attribute_0[key] = {"value": value, "unit": units.get(key), "user_edited": "false"}
        attribute_1[key] = {"value": value, "unit": units.get(key), "user_edited": "false"}
        attribute_2[key] = {"value": value, "unit": units.get(key), "user_edited": "false"}
    i = i + 1
car_0.update(attribute_0)
car_1.update(attribute_1)
car_2.update(attribute_2)
local_cars = [car_0, car_1, car_2]
...
I think you are probably failing to take copies of car_0 etc. Don't forget that python assignment is purely name-binding.
x = car_0
y = car_0
print( x['make']['value'] ) # 'Toyota'
print( y['make']['value'] ) # 'Toyota'
print( x is y ) # True. Both names refer to the same object
x['make']['value'] = 'foo'
print( y['make']['value'] ) # 'foo'
It should have been y = car_0.copy() for a shallow copy, or copy.deepcopy(car_0) (from the copy module) for a fully independent copy that also duplicates the nested dictionaries.
I don't fully follow your code, but if you are still unsure then do some is testing to find out which entities are bound to the same object (and shouldn't be).
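To illustrate (a short sketch of my own, using a simplified attribute template in place of the one built from default_car), a deep copy gives each car its own nested dictionaries:

import copy

attribute = {'make': {'value': 'Toyota', 'unit': '', 'user_edited': 'false'}}

car_0 = copy.deepcopy(attribute)   # each car now owns its own nested dicts
car_1 = copy.deepcopy(attribute)
car_2 = copy.deepcopy(attribute)

car_0['make']['value'] = 'Nissan'
print(car_1['make']['value'])          # still 'Toyota'
print(car_0['make'] is car_1['make'])  # False - no shared inner dict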

How to read the next page on API using python iterator?

There is an API that only produces one hundred results per page. I am trying to make a while loop so that it goes through all pages and takes results from all pages, but it does not work. I would be grateful if you could help me figure it out.
params = dict(
    order_by='salary_desc',
    text=keyword,
    area=area,
    period=30,  # days
    per_page=100,
    page=0,
    no_magic='false',  # disable magic
    search_field='name'  # available: name, description, company_name
)
response = requests.get(
    BASE_URL + '/vacancies',
    headers={'User-Agent': generate_user_agent()},
    params=params,
)
response
items = response.json()['items']
vacancies = []
for item in items:
    vacancies.append(dict(
        id=item['id'],
        name=item['name'],
        salary_from=item['salary']['from'] if item['salary'] else None,
        salary_to=item['salary']['to'] if item['salary'] else None,
        currency=item['salary']['currency'] if item['salary'] else None,
        created=item['published_at'],
        company=item['employer']['name'],
        area=item['area']['name'],
        url=item['alternate_url']
    ))
I loop through the results: while there is something in vacancies, I add 1 to the page parameter:
while vacancies == True:
    params['page'] += 1
But params['page'] stays at zero (pages in this API start at zero). When inspecting params after running the loop, the result is:
{'area': 1,
'no_magic': 'false',
'order_by': 'salary_desc',
'page': 0,
'per_page': 100,
'period': 30,
'search_field': 'name',
'text': '"python"'}
Perhaps I am doing the loop incorrectly, starting from the logic that while there is a result in the dictionary, the loop must be executed.
while vacancies == True:
    params['page'] += 1
will never run, because vacancies will never equal the literal True regardless of its contents. Python containers (such as your vacancies list) are truthy when non-empty, but they are not equal to True. You need to lessen the strictness of the test.
if vacancies:  # truthy if its len > 0, falsey otherwise
    # Do something
Or you can explicitly check that it has content:
if len(vacancies) > 0:
    # Do something
This solves the problem of how to evaluate based on an object but doesn't solve the overall logic problem.
for _ in vacancies:
    params["page"] += 1
    # Does something for every item in vacancies
What you do each loop will depend on the problem and will require another question!
A fixed version is below:
params = dict(
    order_by='salary_desc',
    text=keyword,
    area=area,
    period=30,  # days
    per_page=100,
    page=0,
    no_magic='false',  # disable magic
    search_field='name'  # available: name, description, company_name
)
pages = []
while True:
    response = requests.get(BASE_URL + '/vacancies', headers={'User-Agent': generate_user_agent()}, params=params)
    items = response.json()['items']
    if not items:
        break
    pages.append(items)     # Do it for each page
    params["page"] += 1     # move on only after a non-empty page, so page 0 is not skipped
Make vacancies for each page
results = []
for page in pages:
    vacancies = []
    for item in page:
        vacancies.append(dict(
            id=item['id'],
            name=item['name'],
            salary_from=item['salary']['from'] if item['salary'] else None,
            salary_to=item['salary']['to'] if item['salary'] else None,
            currency=item['salary']['currency'] if item['salary'] else None,
            created=item['published_at'],
            company=item['employer']['name'],
            area=item['area']['name'],
            url=item['alternate_url']
        ))
    results.append(vacancies)
results will then be the final list of all items, grouped by page.
vacancies is never True.
If you want to test the boolean value of vacancies you could use bool(vacancies).
But with Python, you can simply use
while vacancies:
    # some code logic
This way, Python will automatically convert your list to a bool. If your list has something inside (len(your_list) > 0), bool(your_list) evaluates to True; otherwise it's False.
Also, instead of using dict(), you could write your dict this way:
params = {
    'order_by': 'salary_desc',
    'text': keyword,
    'area': area,
    'period': 30,  # days
    'per_page': 100,
    'page': 0,
    'no_magic': 'false',  # disable magic
    'search_field': 'name'  # available: name, description, company_name
}
which is more pythonic.
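A slightly more compact variant (my own sketch, not from the answers, assuming the same BASE_URL, keyword, area and generate_user_agent as in the question) folds the pagination into a generator so all items end up in one flat list:

import requests

def fetch_vacancies(params):
    # Yield items page by page until the API returns an empty page.
    while True:
        response = requests.get(
            BASE_URL + '/vacancies',
            headers={'User-Agent': generate_user_agent()},
            params=params,
        )
        items = response.json()['items']
        if not items:
            return
        yield from items
        params['page'] += 1

all_items = list(fetch_vacancies(params))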

How to scrape and extract all the subcategories names from all its associated pages for a wikipedia category using python 3.6?

I want to scrape all the subcategories and pages under the category header of the Category page: "Category:Computer science". The link for the same is as follows: http://en.wikipedia.org/wiki/Category:Computer_science.
I got an idea for the above problem from the following Stack Overflow answers:
Pythonic beautifulSoup4 : How to get remaining titles from the next page link of a wikipedia category
and
How to scrape Subcategories and pages in categories of a Category wikipedia page using Python
However, those answers do not fully solve the problem. They only scrape the pages in the category "Computer science", but I want to extract all the subcategory names and their associated pages as well. I want the process to report the results in BFS manner with a depth of 10. Is there any way to do this?
I found the following code in the linked post:
from pprint import pprint
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

base_url = 'https://en.wikipedia.org/wiki/Category:Computer science'

def get_next_link(soup):
    return soup.find("a", text="next page")

def extract_links(soup):
    return [a['title'] for a in soup.select("#mw-pages li a")]

with requests.Session() as session:
    content = session.get(base_url).content
    soup = BeautifulSoup(content, 'lxml')
    links = extract_links(soup)
    next_link = get_next_link(soup)
    while next_link is not None:  # while there is a Next Page link
        url = urljoin(base_url, next_link['href'])
        content = session.get(url).content
        soup = BeautifulSoup(content, 'lxml')
        links += extract_links(soup)
        next_link = get_next_link(soup)
pprint(links)
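For the BFS-with-depth-limit part of the question, a rough sketch of my own (not from the linked posts; the '#mw-subcategories' selector is an assumption about the current category-page markup) could look like this:

from collections import deque
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests

start_url = 'https://en.wikipedia.org/wiki/Category:Computer_science'

def subcategory_links(soup, base):
    # Subcategory links sit in the "#mw-subcategories" block of a category page.
    return [urljoin(base, a['href']) for a in soup.select('#mw-subcategories li a')]

seen = set()
queue = deque([(start_url, 0)])          # (category URL, depth)
with requests.Session() as session:
    while queue:
        url, depth = queue.popleft()
        if url in seen or depth > 10:
            continue
        seen.add(url)
        soup = BeautifulSoup(session.get(url).content, 'lxml')
        print('  ' * depth + url)        # report in BFS order, indented by depth
        for link in subcategory_links(soup, url):
            queue.append((link, depth + 1))

It only prints each subcategory URL indented by its depth; collecting the member pages per subcategory could reuse extract_links from the snippet above.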
To scrape the subcategories, you will have to use Selenium to interact with the dropdowns. A simple traversal over the second group of links will yield the pages; however, to find all the subcategories, recursion is needed to group the data properly. The code below utilizes a simple variant of breadth-first search to determine when to stop looping over the dropdown toggle elements generated at each iteration of the while loop:
from selenium import webdriver
import time
from bs4 import BeautifulSoup as soup

def block_data(_d):
    return {_d.find('h3').text: [[i.a.attrs.get('title'), i.a.attrs.get('href')] for i in _d.find('ul').find_all('li')]}

def get_pages(source: str) -> dict:
    return [block_data(i) for i in soup(source, 'html.parser').find('div', {'id': 'mw-pages'}).find_all('div', {'class': 'mw-category-group'})]

d = webdriver.Chrome('/path/to/chromedriver')
d.get('https://en.wikipedia.org/wiki/Category:Computer_science')
all_pages = get_pages(d.page_source)

_seen_categories = []

def get_categories(source):
    return [[i['href'], i.text] for i in soup(source, 'html.parser').find_all('a', {'class': 'CategoryTreeLabel'})]

def total_depth(c):
    return sum(1 if len(b) == 1 and not b[0] else sum([total_depth(i) for i in b]) for a, b in c.items())

def group_categories(source) -> dict:
    return {i.find('div', {'class': 'CategoryTreeItem'}).a.text: (lambda x: None if not x else [group_categories(c) for c in x])(i.find_all('div', {'class': 'CategoryTreeChildren'})) for i in source.find_all('div', {'class': 'CategoryTreeSection'})}

while True:
    full_dict = group_categories(soup(d.page_source, 'html.parser'))
    flag = False
    for i in d.find_elements_by_class_name('CategoryTreeToggle'):
        try:
            if i.get_attribute('data-ct-title') not in _seen_categories:
                i.click()
                flag = True
                time.sleep(1)
        except:
            pass
        else:
            _seen_categories.append(i.get_attribute('data-ct-title'))
    if not flag:
        break
Output:
all_pages:
[{'\xa0': [['Computer science', '/wiki/Computer_science'], ['Glossary of computer science', '/wiki/Glossary_of_computer_science'], ['Outline of computer science', '/wiki/Outline_of_computer_science']]},
{'B': [['Patrick Baudisch', '/wiki/Patrick_Baudisch'], ['Boolean', '/wiki/Boolean'], ['Business software', '/wiki/Business_software']]},
{'C': [['Nigel A. L. Clarke', '/wiki/Nigel_A._L._Clarke'], ['CLEVER score', '/wiki/CLEVER_score'], ['Computational human modeling', '/wiki/Computational_human_modeling'], ['Computational social choice', '/wiki/Computational_social_choice'], ['Computer engineering', '/wiki/Computer_engineering'], ['Critical code studies', '/wiki/Critical_code_studies']]},
{'I': [['Information and computer science', '/wiki/Information_and_computer_science'], ['Instance selection', '/wiki/Instance_selection'], ['Internet Research (journal)', '/wiki/Internet_Research_(journal)']]},
{'J': [['Jaro–Winkler distance', '/wiki/Jaro%E2%80%93Winkler_distance'], ['User:JUehV/sandbox', '/wiki/User:JUehV/sandbox']]},
{'K': [['Krauss matching wildcards algorithm', '/wiki/Krauss_matching_wildcards_algorithm']]},
{'L': [['Lempel-Ziv complexity', '/wiki/Lempel-Ziv_complexity'], ['Literal (computer programming)', '/wiki/Literal_(computer_programming)']]},
{'M': [['Machine learning in bioinformatics', '/wiki/Machine_learning_in_bioinformatics'], ['Matching wildcards', '/wiki/Matching_wildcards'], ['Sidney Michaelson', '/wiki/Sidney_Michaelson']]},
{'N': [['Nuclear computation', '/wiki/Nuclear_computation']]}, {'O': [['OpenCV', '/wiki/OpenCV']]},
{'P': [['Philosophy of computer science', '/wiki/Philosophy_of_computer_science'], ['Prefetching', '/wiki/Prefetching'], ['Programmer', '/wiki/Programmer']]},
{'Q': [['Quaject', '/wiki/Quaject'], ['Quantum image processing', '/wiki/Quantum_image_processing']]},
{'R': [['Reduction Operator', '/wiki/Reduction_Operator']]}, {'S': [['Social cloud computing', '/wiki/Social_cloud_computing'], ['Software', '/wiki/Software'], ['Computer science in sport', '/wiki/Computer_science_in_sport'], ['Supnick matrix', '/wiki/Supnick_matrix'], ['Symbolic execution', '/wiki/Symbolic_execution']]},
{'T': [['Technology transfer in computer science', '/wiki/Technology_transfer_in_computer_science'], ['Trace Cache', '/wiki/Trace_Cache'], ['Transition (computer science)', '/wiki/Transition_(computer_science)']]},
{'V': [['Viola–Jones object detection framework', '/wiki/Viola%E2%80%93Jones_object_detection_framework'], ['Virtual environment', '/wiki/Virtual_environment'], ['Visual computing', '/wiki/Visual_computing']]},
{'W': [['Wiener connector', '/wiki/Wiener_connector']]},
{'Z': [['Wojciech Zaremba', '/wiki/Wojciech_Zaremba']]},
{'Ρ': [['Portal:Computer science', '/wiki/Portal:Computer_science']]}]
full_dict is quite large, and due to its size I am unable to post it here in full. However, below is a function that traverses the structure and selects all the elements down to a depth of ten:
def trim_data(d, depth, count):
    return {a: None if count == depth else [trim_data(i, depth, count + 1) for i in b] for a, b in d.items()}

final_subcategories = trim_data(full_dict, 10, 0)
Edit: script to remove leaves from tree:
def remove_empty_children(d):
    return {a: None if len(b) == 1 and not b[0] else
            [remove_empty_children(i) for i in b if i] for a, b in d.items()}
When running the above:
c = {'Areas of computer science': [{'Algorithms and data structures': [{'Abstract data types': [{'Priority queues': [{'Heaps (data structures)': [{}]}, {}], 'Heaps (data structures)': [{}]}]}]}]}
d = remove_empty_children(c)
Output:
{'Areas of computer science': [{'Algorithms and data structures': [{'Abstract data types': [{'Priority queues': [{'Heaps (data structures)': None}], 'Heaps (data structures)': None}]}]}]}
Edit 2: flattening the entire structure:
def flatten_groups(d):
    for a, b in d.items():
        yield a
        if b is not None:
            for i in map(flatten_groups, b):
                yield from i

print(list(flatten_groups(remove_empty_children(c))))
Output:
['Areas of computer science', 'Algorithms and data structures', 'Abstract data types', 'Priority queues', 'Heaps (data structures)', 'Heaps (data structures)']
Edit 3:
To access all the pages for every subcategory down to a certain level, the original get_pages function can be used along with a slightly different version of the group_categories method:
def _group_categories(source) -> dict:
    return {i.find('div', {'class': 'CategoryTreeItem'}).find('a')['href']: (lambda x: None if not x else [group_categories(c) for c in x])(i.find_all('div', {'class': 'CategoryTreeChildren'})) for i in source.find_all('div', {'class': 'CategoryTreeSection'})}

from collections import namedtuple

page = namedtuple('page', ['pages', 'children'])

def subcategory_pages(d, depth, current=0):
    r = {}
    for a, b in d.items():
        all_pages_listing = get_pages(requests.get(f'https://en.wikipedia.org{a}').text)
        print(f'page number for {a}: {len(all_pages_listing)}')
        r[a] = page(all_pages_listing, None if current == depth else [subcategory_pages(i, depth, current + 1) for i in b])
    return r

print(subcategory_pages(full_dict, 2))
Please note that in order to utilize subcategory_pages, _group_categories must be used in place of group_categories.

How to use "While()" in python [closed]

It's difficult to tell what is being asked here. This question is ambiguous, vague, incomplete, overly broad, or rhetorical and cannot be reasonably answered in its current form. For help clarifying this question so that it can be reopened, visit the help center.
Closed 9 years ago.
Hi all, my code below allows me to extract some specific information from the data. I would like someone to help me write this more properly, using a while loop, so I can handle many lines; right now I only have two lines of data. I'm a beginner, so if someone can help, please explain so I can learn and not just copy and paste =)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
tableau = []
data = "00:02:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxx#x.fr mid:6499"
result1 = {}
i = re.findall(r"^.[^\ ]*", data )
j = re.findall(r"\d+$", data )
k = re.findall(r"O:[^\ ]*", data )
r = re.findall(r"R:[^\ ]*", data )
result1 = {'Heure':i,'MID':j,'Source':k,'Destination':r}
data = "00:03:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxxx#xxxxx.fr mid:6599"
result2 = {}
i = re.findall(r"^.[^\ ]*", data )
j = re.findall(r"\d+$", data )
k = re.findall(r"O:[^\ ]*", data )
r = re.findall(r"R:[^\ ]*", data )
result2 = {'Heure':i,'MID':j,'Source':k,'Destination':r}
tableau.append(result1)
tableau.append(result2)
print tableau
This is actually done better with a for loop:
data1 = "00:02:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxx#x.fr mid:6499"
data2 = "00:03:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxxx#xxxxx.fr mid:6599"

data_list = [data1, data2]  # store the data in a list so we can iterate over it
tableau = []  # create a list to hold our output
for data in data_list:  # iterate over the list, getting 1 "data" at a time
    # extract info we want
    i = re.findall(r"^.[^\ ]*", data)
    j = re.findall(r"\d+$", data)
    k = re.findall(r"O:[^\ ]*", data)
    r = re.findall(r"R:[^\ ]*", data)
    # create dictionary and append it to tableau
    tableau.append({'Heure': i, 'MID': j, 'Source': k, 'Destination': r})
More advanced users would probably use a function here which takes the string as input and returns a dictionary of the desired data:
def extract(data):
    i = re.findall(r"^.[^\ ]*", data)
    j = re.findall(r"\d+$", data)
    k = re.findall(r"O:[^\ ]*", data)
    r = re.findall(r"R:[^\ ]*", data)
    return {'Heure': i, 'MID': j, 'Source': k, 'Destination': r}
Now you can use this in a list comprehension:
tableau = [extract(data) for data in data_list]
From the comments, it looks like you're getting the lines of data from a file. That's even better (Who wants to type all those strings?). Now we can shorten this to:
with open('filename') as fin:
    tableau = [extract(data) for data in fin]
Using with introduces another Python structure (the context manager). That's a little more complex, but it's the preferred way to open a file. For file objects, it's functionally equivalent to:
fin = open('filename')
tableau = ...
fin.close()
Here's another approach. This parses your data in a more efficient way, using a function that you can simply feed a list of data to. It is also very easy to turn this into a generator.
import re

def parser(data):
    result = []
    for p in data:
        ms = re.match(r'(\S+).*?(O:\S+).*(R:\S+).*mid:(\d+)', p)
        if not ms:
            continue
        result.append({'Heure': ms.group(1), 'Source': ms.group(2), 'Destination': ms.group(3), 'MID': ms.group(4)})
    return result

data = ["00:02:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxx#x.fr mid:6499",
        "00:03:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxxx#xxxxx.fr mid:6599"]

print parser(data)
Results:
>>>
[{'Source': 'O:NVS:FAXG3/', 'Destination': 'R:NVS:SMTP.0/xxxx#x.fr', 'Heure': '00:02:12.935', 'MID': '6499'},
{'Source': 'O:NVS:FAXG3/', 'Destination': 'R:NVS:SMTP.0/xxxxx#xxxxx.fr', 'Heure': '00:03:12.935', 'MID': '6599'}]
As a generator:
import re

def parser(data):
    for p in data:
        ms = re.match(r'(\S+).*?(O:\S+).*(R:\S+).*mid:(\d+)', p)
        if not ms:
            continue
        yield {'Heure': ms.group(1), 'Source': ms.group(2), 'Destination': ms.group(3), 'MID': ms.group(4)}

data = ["00:02:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxx#x.fr mid:6499",
        "00:03:12.935 mta Messages I Doc O:NVS:FAXG3/ R:NVS:SMTP.0/xxxxx#xxxxx.fr mid:6599"]

for r in parser(data):
    print r
Results:
>>>
{'Source': 'O:NVS:FAXG3/', 'Destination': 'R:NVS:SMTP.0/xxxx#x.fr', 'Heure': '00:02:12.935', 'MID': '6499'}
{'Source': 'O:NVS:FAXG3/', 'Destination': 'R:NVS:SMTP.0/xxxxx#xxxxx.fr', 'Heure': '00:03:12.935', 'MID': '6599'}
Using @mgilson's answer idea with my regex:
def extract(data):
    ms = re.match(r'(\S+).*?(O:\S+).*(R:\S+).*mid:(\d+)', data)
    if not ms:
        raise Exception('Could not extract data')
    return {'Heure': ms.group(1), 'Source': ms.group(2), 'Destination': ms.group(3), 'MID': ms.group(4)}

tableau = [extract(data) for data in data_list]
I don't think while is the best way to do what you are expecting. Maybe you can use
for data in dataArray:
where dataArray contains your data strings.
Thanks to Wooble for inspiring this While function and example. The idea got me thinking how to do it.
>>> def While(function, *args, **kwargs):
        while function(*args, **kwargs): pass

>>> def unstack(array):
        print(array.pop())
        return array
>>> While(unstack, ['world!', 'there', 'Hello'])
Hello
there
world!
>>> def fib(state):
        state.append(sum(state))
        print(state.pop(0))
        return state[0] < 1000
>>> While(fib, [0, 1])
0
1
1
2
3
5
8
13
21
34
55
89
144
233
377
610
987
>>>
Generators are rather nice too, so a WhileGenerator was created as well to satisfy my curiosity.
>>> def WhileGenerator(function, *args, **kwargs):
        iterator = iter(function(*args, **kwargs))
        while next(iterator):
            yield next(iterator)
>>> import operator, functools, itertools
>>> for value in WhileGenerator(lambda a, b: functools.reduce(operator.add,
                                                              itertools.zip_longest(a, b)),
                                (True, True, True, False),
                                'Hello there world!'.split()):
        print(value)
Hello
there
world!
>>> def fib_gen(state, limit):
        while True:
            yield state[0] < limit
            state.append(sum(state))
            yield state.pop(0)

>>> for value in WhileGenerator(fib_gen, [0, 1], 1000):
        print(value)
0
1
1
2
3
5
8
13
21
34
55
89
144
233
377
610
987
>>>
