I am learning python and coding. I am trying one web scraping example. I download the currency exchange data from a website and I want to compute average exchange rate for each currency over a 50 days period. The problem is that I am unable to do the following.
I get results from first function which should be in form of a dictionary and then pass these dictionaries to another function as argument and to perform averaging of those values. I am unable to pass correctly dict values to another function.
my code is as follow
import os
import webbrowser
import requests as rq
import sys
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET
def saveData(path, date):
    session = rq.session()
    url = 'https://www.bnm.md/en/official_exchange_rates?get_xml=1&date=' + date
    datastore = session.get(url)
    with open(path, 'wb') as f:
        f.write(datastore.content)
    data = ET.fromstring(datastore.content)
    '''
    elements = {}
    for element in data.iter():
        if element.tag in ('Name', 'Value'):
            elements[element.tag] = element.text
            print 'elements:', elements
    # Here I want to combine all those dictionaries in one variable so that I can pass it as an argument to another function
    return elements
    '''
    # I replaced the triple-quoted code above with the code below
    elements = {}
    for tag, text in data.items():
        if tag in ('Name', 'Value'):
            elements.setdefault(tag, [])
            elements[tag].append(text)
    return elements
def computeAverage(elements):  # I want to pass the saveData() results, which are in dictionary form, to this function, but I am unable to solve this issue.
    print elements
def main():
    dates = ['20.04.2016', '21.04.2016', '22.04.2016']
    paths = []
    for date in dates:
        path = '/home/robbin/Desktop/webscrape/{}.xml'.format(date)
        paths.append(path)
    data3 = {}
    for path, date in zip(paths, dates):
        data2 = saveData(path, date)
        print 'data2: ', data2
        for k, v in data2.items():
            data3.setdefault(k, [])
            data3[k].append(v)
        print 'data3: ', data3
    computeAverage(data3)

if __name__ == '__main__':
    main()
Also, the results I get from the saveData() function are dictionaries like the ones below, and each dictionary still carries the value from the previous item, which is wrong.
elements: {'Name': 'Euro'}
elements: {'Name': 'Euro', 'Value': '22.4023'}
elements: {'Name': 'US Dollar', 'Value': '22.4023'}
elements: {'Name': 'US Dollar', 'Value': '19.7707'}
elements: {'Name': 'Russian Ruble', 'Value': '19.7707'}
elements: {'Name': 'Russian Ruble', 'Value': '0.3014'}
elements: {'Name': 'Romanian Leu', 'Value': '0.3014'}
elements: {'Name': 'Romanian Leu', 'Value': '4.9988'}
What I actually tried to get (but failed) are results like this:
elements: {'Name': 'Euro', 'Value': '22.4023'}
elements: {'Name': 'US Dollar', 'Value': '19.7707'}
elements: {'Name': 'Russian Ruble', 'Value': '0.3014'}
elements: {'Name': 'Romanian Leu', 'Value': '4.9988'}
Updates:-------------
elements = []
for element in data.iter():
    if element.tag in ('Name', 'Value'):
        elements.append(element.text)
# print 'elements: ', elements
return elements
and in the main() function I do:
for path, date in zip(paths, dates):
    data = saveData(path, date)
    # print 'data from main: ', data
    computeAverage(data)
and the output of "print 'data from main: ', data" looks like this
['Euro', '22.4023', 'US Dollar', '19.7707', 'Russian Ruble', '0.3014', 'Romanian Leu', '4.9988',.........'Special Drawing Rights', '27.8688']
['Euro', '22.4408', 'US Dollar', '19.7421', 'Russian Ruble', '0.3007', 'Romanian Leu', '5.0012',.....'Special Drawing Rights', '27.8606']
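One way to pair the names and values back up from that flat list (just a small sketch based on the output above) is to zip the even-indexed and odd-indexed entries:
flat = ['Euro', '22.4023', 'US Dollar', '19.7707',
        'Russian Ruble', '0.3014', 'Romanian Leu', '4.9988']
rates = dict(zip(flat[0::2], flat[1::2]))
print rates
# -> {'Euro': '22.4023', 'US Dollar': '19.7707', 'Russian Ruble': '0.3014', 'Romanian Leu': '4.9988'} (key order may differ)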
I am a newbie to coding; if someone could help me with these two problems, I would be really thankful.
First of all, I agree with #Prakhar Verma.
Second, you didn't clearly state what you want, but I assume you want to merge the data you get from the 'saveData' function and then calculate the average. So, here is the missing code.
data3 = {}
for path, date in zip(paths, dates):
    data2 = saveData(path, date)
    for k, v in data2.items():
        # you can move this line after declaring the data3 dict if keys returned by saveData are fixed i.e. name, value
        data3.setdefault(k, [])
        data3[k].append(v)
computeAverage(data3)
Update to saveData function:
elements = {}
for tag, text in data.items():
    if tag in ('Name', 'Value'):
        elements.setdefault(tag, [])
        elements[tag].append(text)
===================================================
Update 2:
def saveData(path, date):
    # session = rq.session()
    url = 'https://www.bnm.md/en/official_exchange_rates?get_xml=1&date=' + date
    datastore = rq.get(url)
    with open(path, 'wb') as f:
        f.write(datastore.content)
    data = ET.fromstring(datastore.content)
    elements = {}
    for element in data.iter():
        tag = element.tag
        text = element.text
        if tag in ('Name', 'Value'):
            elements.setdefault(tag, [])
            elements[tag].append(text)
    return elements
def main():
    dates = ['20.03.2016', '21.03.2016', '22.03.2016']
    paths = []
    for date in dates:
        # please edit this
        path = '{}.xml'.format(date)
        paths.append(path)
    data3 = {}
    for path, date in zip(paths, dates):
        data2 = saveData(path, date)
        for k, v in data2.items():
            data3.setdefault(k, [])
            data3[k].append(v)
    computeAverage(data3)
The 'saveData' function is returning data, but you are not saving it in any variable. So what you need to do is save the data returned from the 'saveData' function and then send it as a parameter to the 'computeAverage' function.
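As a rough sketch, 'computeAverage' could then average each currency over the dates, assuming the 'Name' and 'Value' lists returned by saveData line up one-to-one (which the XML output above suggests):
def computeAverage(data3):
    # data3['Name'] holds one list of currency names per date,
    # data3['Value'] holds one list of rate strings per date
    totals = {}
    counts = {}
    for names, values in zip(data3['Name'], data3['Value']):
        for name, value in zip(names, values):
            totals[name] = totals.get(name, 0.0) + float(value)
            counts[name] = counts.get(name, 0) + 1
    for name in totals:
        print name, totals[name] / counts[name]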
Please go through the basics of coding and follow any programming tutorial. :)
Related
I'm scraping a website, which returns a dictionary:
person = {'name0':{'first0': 'John', 'last0':'Smith'},
'age0':'10',
'location0':{'city0':'Dublin'}
}
I'm trying to write a function that will return a dictionary {'name':'John', 'age':'10'} when passed the above dictionary.
I want to ideally put a try:... except KeyError around each item since sometimes keys will be missing.
def func(person):
    filters = [('age', 'age0'), ('name', ['name0', 'first0'])]
    result = {'name': None, 'age': None}
    for i in filters:
        try:
            result[i[0]] = person[i[1]]
        except KeyError:
            pass
    return result
The problem is result[i[0]] = person[i[1]] doesn't work for 'name' since there's two keys that need to be followed sequentially and I don't know how to do that.
I want some way of telling it (in the loop) to go to person['name0']['first0'] (and so on to whatever depth the thing I want is).
I have lots of things to extract, so I'd rather do it in a loop instead of a try..except statement for each variable individually.
In order to follow several keys sequentially, you can use get and set the default value to {} (an empty dictionary) for the upper levels. Set the default value to None (or whatever suits you) for the last level:
def func(person):
    return {'name': person.get('name0', {}).get('first0', None),
            'age': person.get('age0', None)}
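A quick check against the sample dictionary from the question (just an illustration of the behaviour, including a missing key):
person = {'name0': {'first0': 'John', 'last0': 'Smith'},
          'age0': '10',
          'location0': {'city0': 'Dublin'}}

print(func(person))          # {'name': 'John', 'age': '10'}
print(func({'age0': '10'}))  # {'name': None, 'age': '10'}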
Best I could manage was using a for loop to iterate through the keys:
person = {'name0':{'first0': 'John', 'last0':'Smith'},
'age0':'10',
'location0':{'city0':'Dublin'}
}
Additionally I used .get(key) rather than try..except as suggested by #wiwi
def func(person):
    filters = [('age', ['age0']), ('name', ['name0', 'first0'])]
    result = {'name': None, 'age': None}
    for filter in filters:
        temp = person.copy()
        for key in filter[1]:
            temp = temp.get(key)
            if not temp:  # NoneType doesn't have a .get method
                break
        result[filter[0]] = temp
    return result
func(person) then returns {'name': 'John', 'age': '10'}.
It handles missing input too:
person2 = {'age0':'10',
'location0':{'city0':'Dublin'}}
func(person2) returns {'name': None, 'age': '10'}
You can put the try...except in another loop, if there's a list of keys instead of a single key:
def getNestedVal(obj, kPath: list, defaultVal=None):
    if isinstance(kPath, str) or not hasattr(kPath, '__iter__'):
        kPath = [kPath]  # if not iterable, wrap as a list
    for k in kPath:
        try: obj = obj[k]
        except: return defaultVal
    return obj

def func(person):
    filters = [('age', 'age0'), ('name', ['name0', 'first0']),  # ]
               ('gender', ['gender0'], 'N/A')]  # includes default value
    return {k[0]: getNestedVal(person, *k[1:3]) for k in filters}
[I added gender just to demonstrate how defaults can also be specified for missing values.]
With this, func(person) should return
{'age': '10', 'name': 'John', 'gender': 'N/A'}
I also have a flattenObj function, a version of which is defined below:
def flattenDict(orig: dict, kList=[], kSep='_', stripNum=True):
    if not isinstance(orig, dict): return [(kList, orig)]
    tList = []
    for k, v in orig.items():
        if isinstance(k, str) and stripNum: k = k.strip('0123456789')
        tList += flattenDict(v, kList + [str(k)], None)
    if not isinstance(kSep, str): return tList
    return {kSep.join(kl): v for kl, v in tList}
[I added stripNum just to get rid of the 0s in your keys...]
flattenDict(person) should return
{'name_first': 'John', 'name_last': 'Smith', 'age': '10', 'location_city': 'Dublin'}
I am struggling with figuring out the best way to loop through a function. The output of this API is a Graph Connection and I am a little out of my element. I really need to obtain IDs from an API output and have them in a dict or some other form that I can pass to another API call.
It is important to note that the original output is a graph connection. print(type(api_response)) does show it as a list; however, if I do print(type(api_response[0])) it returns a
This is the original output from the api call:
[{'_from': None, 'to': {'id': '5c9941fcdd2eeb6a6787916e', 'type': 'user'}}, {'_from': None, 'to': {'id': '5cc9055fcc5781152ca6eeb8', 'type': 'user'}}, {'_from': None, 'to': {'id': '5d1cf102c94c052cf1bfb3cc', 'type': 'user'}}]
This is the code that I have up to this point.....
api_response = api_instance.graph_user_group_members_list(group_id, content_type, accept, limit=limit, skip=skip, x_org_id=x_org_id)

def extract_id(result):
    result = str(result).split(' ')
    for i, r in enumerate(result):
        if 'id' in r:
            id = (result[i+1].translate(str.maketrans('', '', string.punctuation)))
            print(id)
    return id

extract_id(api_response)

def extract_id(result):
    result = str(result).split(' ')
    for i, r in enumerate(result):
        if 'id' in r:
            id = (result[i+8].translate(str.maketrans('', '', string.punctuation)))
            print(id)
    return id

extract_id(api_response)

def extract_id(result):
    result = str(result).split(' ')
    for i, r in enumerate(result):
        if 'id' in r:
            id = (result[i+15].translate(str.maketrans('', '', string.punctuation)))
            print(id)
    return id

extract_id(api_response)
I have been able to use a function to extract the IDs, but I am doing so through a string. I need a scalable solution that I can use to pass these IDs along to another API call.
I have tried to use a for loop, but because it is one string and i+1 defines the id's position, it is redundant and just outputs one of the ids multiple times.
I am receiving the correct output using each of these functions; however, it is not scalable and just is not a solution. Please help guide me.
So, to solve the response-as-a-string issue, I would suggest using Python's built-in json module. Specifically, the method .loads() can convert a string to a dict or a list of dicts. From there you can iterate over the list or dict and check if the key is equal to 'id'. Here's an example based on what you said the response would look like.
import json
s = "[{'_from': None, 'to': {'id': '5c9941fcdd2eeb6a6787916e', 'type': 'user'}}, {'_from': None, 'to': {'id': '5cc9055fcc5781152ca6eeb8', 'type': 'user'}}, {'_from': None, 'to': {'id': '5d1cf102c94c052cf1bfb3cc', 'type': 'user'}}]"
# json uses double quotes and null; there is probably a better way to do this though
s = s.replace("\'", '\"').replace('None', 'null')
response = json.loads(s) # list of dicts
for d in response:
    for key, value in d['to'].items():
        if key == 'id':
            print(value)  # or whatever else you want to do
# 5c9941fcdd2eeb6a6787916e
# 5cc9055fcc5781152ca6eeb8
# 5d1cf102c94c052cf1bfb3cc
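If the items in api_response already behave like the dicts shown above (that depends on what type api_response[0] actually is), a more direct sketch that skips the string conversion entirely could look like this:
# sample data mirroring the output shown in the question
api_response = [
    {'_from': None, 'to': {'id': '5c9941fcdd2eeb6a6787916e', 'type': 'user'}},
    {'_from': None, 'to': {'id': '5cc9055fcc5781152ca6eeb8', 'type': 'user'}},
    {'_from': None, 'to': {'id': '5d1cf102c94c052cf1bfb3cc', 'type': 'user'}},
]

ids = [item['to']['id'] for item in api_response]
print(ids)
# ['5c9941fcdd2eeb6a6787916e', '5cc9055fcc5781152ca6eeb8', '5d1cf102c94c052cf1bfb3cc']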
I am searching a string of text which contains dictionaries that look like so:
soup_string = """{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:00+01:00/13:00+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"full","slotDiscountedByDP":"false","slotId":"1hr-12-13-20190613","time":"12:00pm - 1:00pm","rawSlotPrice":"","slotDiscounted":"false"},
{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:30+01:00/13:30+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"available","slotDiscountedByDP":"false","slotId":"1hr-12:30-13:30-20190613","time":"12:30pm - 1:30pm","rawSlotPrice":"","slotDiscounted":"false"}"""
I am looking to return the string which follows each key in the 'dictionaries'.
I have decided an appropriate method is to use regular expressions. I can return the times and costs using
Costs = re.findall(r"\£[0-9]\.[0-9][0-9]", soup_string)
times = re.findall(r'\"(time)\"\:\"(.{14,16})\"\,', soup_string)
Essentially I would like to be able to look for each key in the dictionary, and search for a specific string then return the value.
The end goal is to create a dictionary with the 'Cost', 'Availability' and 'time'.
Full code:
import requests
from bs4 import BeautifulSoup
import json
postcode = "L4 0TH"
ASDA_url = "https://groceries.asda.com/api/user/checkpostcode?postcode="+ postcode + "&requestorigin=gi"
ASDA_url2 = "https://groceries.asda.com/api/slot/view?startdate=12%2F06%2F2019&deliveryoption=homedelivery&requestorigin=gi&_="
client = requests.Session()
r = client.get(ASDA_url)
r2 = client.get(ASDA_url2)
soup = BeautifulSoup(r2.text, 'html.parser')
soup_string = str(soup)
soup_dicts = json.loads('[' + soup_string + ']')
keep_keys = ('cost', 'availability', 'time')
filtered = [{k:soup_dict[k] for k in keep_keys} for soup_dict in soup_dicts]
Given that you have multiple dictionaries, I'm not exactly sure what you're trying to obtain, but from my understanding this should help:
import json
soup_string = ''' ... ''' # As it is in the question
soup_dicts = json.loads('[' + soup_string + ']')
keep_keys = ('cost', 'availability', 'time')
filtered = [{k:soup_dict[k] for k in keep_keys} for soup_dict in soup_dicts]
It treats your string of dictionaries as a list of JSON dictionaries, and uses the json module to parse it. Then it filters out everything except the key/value pairs you need. The result is a list of the filtered dictionaries.
Output (i.e. value of filtered):
[
{'cost': '£2.00', 'availability': 'full', 'time': '12:00pm - 1:00pm'},
{'cost': '£2.00', 'availability': 'available', 'time': '12:30pm - 1:30pm'}
]
EDIT:
In response to you providing your code, I can see that you're calling str on the results from BeautifulSoup. Rather than doing that, you can just process the client.get() results directly:
import json
import requests
postcode = "L4 0TH"
ASDA_url = "https://groceries.asda.com/api/user/checkpostcode?postcode="+ postcode + "&requestorigin=gi"
ASDA_url2 = "https://groceries.asda.com/api/slot/view?startdate=12%2F06%2F2019&deliveryoption=homedelivery&requestorigin=gi&_="
client = requests.Session()
r = client.get(ASDA_url)
r2 = client.get(ASDA_url2)
dicts = r2.json()['slotHeader'][0]['slots']
keep_keys = ('cost', 'availability', 'time')
filtered = [{k:d[k] for k in keep_keys} for d in dicts]
First you need to put your data into a list and wrap it in a dictionary with the key 'data' (see my example below). Then use json to convert it into a dictionary of dictionaries. Then extract cost, availability and time for each dictionary in a loop.
import json
soup_string = """{"data": [{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:00+01:00/13:00+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"full","slotDiscountedByDP":"false","slotId":"1hr-12-13-20190613","time":"12:00pm - 1:00pm","rawSlotPrice":"","slotDiscounted":"false"}, {"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:30+01:00/13:30+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"available","slotDiscountedByDP":"false","slotId":"1hr-12:30-13:30-20190613","time":"12:30pm - 1:30pm","rawSlotPrice":"","slotDiscounted":"false"}]}"""
d = json.loads(soup_string)
result = []
for data in d['data']:
    tmp = {}
    tmp['Cost'] = data['cost']
    tmp['Availability'] = data['availability']
    tmp['Time'] = data['time']
    result.append(tmp)
result
Output:
[{'Cost': '£2.00', 'Availability': 'full', 'Time': '12:00pm - 1:00pm'},
{'Cost': '£2.00', 'Availability': 'available', 'Time': '12:30pm - 1:30pm'}]
I am currently having trouble parsing a deeply nested JSON response from a HTTP API call.
My JSON Response is like
{'took': 476,
'_revision': 'r08badf3',
'response': {'accounts': {'hits': [{'name': '4002238760',
'display_name': 'Googleglass-4002238760',
'selected_fields': ['Googleglass',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4002238760',
'DDMadarchod-INSTE',
None,
'Googleglass',
'0001012556',
'CC',
'Setu Non Standard',
'40022387',
320142,
4651321321333,
1324650651651]},
{'name': '4003893720',
'display_name': 'Swift-4003893720',
'selected_fields': ['Swift',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4003893720',
'DDMadarchod-UPTM-RemotexNBD',
None,
'S.W.I.F.T. SCRL',
'0001000110',
'SE',
'Setu Non Standard',
'40038937',
189508,
1464739200000,
1559260800000]},
After I receive the response, I store it in a data object using json_normalize:
data = response.json()
data = data['response']['accounts']['hits']
data = json_normalize(data)
However, after I normalize it, my dataframe looks like this
My curl statement looks like this:
curl --data 'query= {"terms":[{"type":"string_attribute","attribute":"Account Type","query_term_id":"account_type","in_list":["Contract"]},{"type":"string","term":"status_group","in_list":["paying"]},{"type":"string_attribute","attribute":"Region","in_list":["DDEU"]},{"type":"string_attribute","attribute":"Country","in_list":["Belgium"]},{"type":"string_attribute","attribute":"CSM Tag","in_list":["EU CSM"]},{"type":"date_attribute","attribute":"Contract Renewal Date","gte":1554057000000,"lte":1561833000000}],"count":1000,"offset":0,"fields":[{"type":"string_attribute","attribute":"DomainName","field_display_name":"Client Name"},{"type":"string_attribute","attribute":"Region","field_display_name":"Region"},{"type":"string_attribute","attribute":"Country","field_display_name":"Country"},{"type":"string_attribute","attribute":"Success Manager","field_display_name":"Client Success Manager"},{"type":"string","term":"identifier","field_display_name":"Account id"},{"type":"string_attribute","attribute":"DeviceSLA","field_display_name":"[FIN] Material Part Number"},{"type":"string_attribute","attribute":"SFDCAccountId","field_display_name":"SFDCAccountId"},{"type":"string_attribute","attribute":"Client","field_display_name":"[FIN] Client Sold-To Name"},{"type":"string_attribute","attribute":"Sold To Code","field_display_name":"[FIN] Client Sold To Code"},{"type":"string_attribute","attribute":"BU","field_display_name":"[FIN] Active BUs"},{"type":"string_attribute","attribute":"Service Type","field_display_name":"[FIN] Service Type"},{"type":"string_attribute","attribute":"Contract Header ID","field_display_name":"[FIN] SAP Contract Header ID"},{"type":"number_attribute","attribute":"Contract Value","field_display_name":"[FIN] ACV - Annual Contract Value","desc":true},{"type":"date_attribute","attribute":"Contract Start Date","field_display_name":"[FIN] Contract Start Date"},{"type":"date_attribute","attribute":"Contract Renewal Date","field_display_name":"[FIN] Contract Renewal Date"}],"scope":"all"}' --header 'app-token:YOUR-TOKEN-HERE' 'https://app.totango.com/api/v1/search/accounts'
So ultimately I want to store the Response in a dataframe along with the field names.
I've had to do this sort of thing a few times in the past (flattening out a nested JSON). I'll explain my process, and you can see if it works, or at least rework the code a bit to fit your needs.
1) Took the data response, and completely flattened it out using a function. This blog was very helpful when I first had to do this.
2) Then it iterates through the flat dictionary created to find where each rows and columns are needed to be created by the numbering of the new key names within the nested parts. There are also keys that are unique/distinct, so they don't have a number to identify as a "new" row, so I account for those in what I called special_cols.
3) As it iterates through those, pulls the specified row number (embedded in those flat keys), and then constructs the dataframe in that way.
It sounds complicated, but if you debug and run it line by line, you can see how it works. Nonetheless, I believe it should get you what you need.
data = {'took': 476,
'_revision': 'r08badf3',
'response': {'accounts': {'hits': [{'name': '4002238760',
'display_name': 'Googleglass-4002238760',
'selected_fields': ['Googleglass',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4002238760',
'DDMadarchod-INSTE',
None,
'Googleglass',
'0001012556',
'CC',
'Setu Non Standard',
'40022387',
320142,
4651321321333,
1324650651651]},
{'name': '4003893720',
'display_name': 'Swift-4003893720',
'selected_fields': ['Swift',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4003893720',
'DDMadarchod-UPTM-RemotexNBD',
None,
'S.W.I.F.T. SCRL',
'0001000110',
'SE',
'Setu Non Standard',
'40038937',
189508,
1464739200000,
1559260800000]}]}}}
import pandas as pd
import re
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out
flat = flatten_json(data)
results = pd.DataFrame()
special_cols = []
columns_list = list(flat.keys())
for item in columns_list:
    try:
        row_idx = re.findall(r'\_(\d+)\_', item)[0]
    except:
        special_cols.append(item)
        continue
    column = re.findall(r'\_\d+\_(.*)', item)[0]
    column = column.replace('_', '')
    row_idx = int(row_idx)
    value = flat[item]
    results.loc[row_idx, column] = value

for item in special_cols:
    results[item] = flat[item]
Output:
print (results.to_string())
name displayname selectedfields0 selectedfields1 selectedfields2 selectedfields3 selectedfields4 selectedfields5 selectedfields6 selectedfields7 selectedfields8 selectedfields9 selectedfields10 selectedfields11 selectedfields12 selectedfields13 selectedfields14 took _revision
0 4002238760 Googleglass-4002238760 Googleglass DDMonkey Papu New Guinea Jonathan Vardharajan 4002238760 DDMadarchod-INSTE NaN Googleglass 0001012556 CC Setu Non Standard 40022387 320142.0 4.651321e+12 1.324651e+12 476 r08badf3
1 4003893720 Swift-4003893720 Swift DDMonkey Papu New Guinea Jonathan Vardharajan 4003893720 DDMadarchod-UPTM-RemotexNBD NaN S.W.I.F.T. SCRL 0001000110 SE Setu Non Standard 40038937 189508.0 1.464739e+12 1.559261e+12 476 r08badf3
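As a side note, a shorter alternative sketch is possible, assuming a reasonably recent pandas where json_normalize is available at the top level (reusing the data dict from above); the list column is expanded separately and glued back on:
import pandas as pd

hits = data['response']['accounts']['hits']
df = pd.json_normalize(hits)  # 'selected_fields' stays as a list-valued column here

# expand the list column into one column per position and join it back on
fields = pd.DataFrame(df.pop('selected_fields').tolist()).add_prefix('selectedfields')
results = pd.concat([df, fields], axis=1)
print(results.head())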
I'm wondering if anyone has a sort of hacky / cool solution to this problem. I have a text file like so:
NAME:name
ID:id
PERSON:person
LOCATION:location
NAME:name
morenamestuff
ID:id
PERSON:person
LOCATION:location
JUNK
So I have some blocks that all contain lines that can be split into a dict, and some that cannot. How can I take lines without the : character and join them to the previous line? Here's what I'm currently doing:
# loop through chunk
# the first element of dat is a Title, so skip that
key_map = dict(x.split(':') for x in dat[1:])
But I of course get an error because the second chunk has a line without the : character. So I wanted my dict to look something like this after correctly splitting it:
# there will be a key_map for each chunk of data
key_map['NAME'] == 'name morenamestuff' # 3rd line appended to previous
key_map['ID'] == 'id'
key_map['PERSON'] == 'person'
key_map['LOCATION'] == 'location'
Solution
EDIT: Here's my final solution on github, and the full code here:
parseScript.py
import re
import string

bad_chars = '(){}"<>[] '  # characters we want to strip from the string
key_map = []

# parse file
with open("dat.txt") as f:
    data = f.read()
    data = data.strip('\n')
    data = re.split('}|\[{', data)

# format file
with open("format.dat") as f:
    formatData = [x.strip('\n') for x in f.readlines()]

data = filter(len, data)

# strip and split each station
for dat in data[1:-1]:
    # perform black magic, don't even try to understand this
    dat = dat.translate(string.maketrans("", ""), bad_chars).split(',')
    key_map.append(dict(x.split(':') for x in dat if ':' in x))
    if ':' not in dat[1]:
        key_map[-1]['NAME'] += dat[2]

for station in range(0, len(key_map)):
    for opt in formatData:
        print opt, ":", key_map[station][opt]
    print ""
dat.txt
View raw here
format.dat
NAME
STID
LONGITUDE
LATITUDE
ELEVATION
STATE
ID
out.dat
View raw here
When in doubt, write your own generator.
Add in itertools.groupby to chunk by groups of text delimited by whitespace breaks.
def chunker(s):
    it = iter(s)
    out = [next(it)]
    for line in it:
        if ':' in line or not line:
            yield ' '.join(out)
            out = []
        out.append(line)
    if out:
        yield ' '.join(out)
usage:
from itertools import groupby
[dict(x.split(':') for x in g) for k,g in groupby(chunker(lines), bool) if k]
Out[65]:
[{'ID': 'id', 'LOCATION': 'location', 'NAME': 'name', 'PERSON': 'person'},
{'ID': 'id',
'LOCATION': 'location',
'NAME': 'name morenamestuff',
'PERSON': 'person'}]
(if those fields are always the same, I'd go with something like creating some namedtuples instead of a bunch of dicts)
from collections import namedtuple
Thing = namedtuple('Thing', 'ID LOCATION NAME PERSON')
[Thing(**dict(x.split(':') for x in g)) for k,g in groupby(chunker(lines), bool) if k]
Out[76]:
[Thing(ID='id', LOCATION='location', NAME='name', PERSON='person'),
Thing(ID='id', LOCATION='location', NAME='name morenamestuff', PERSON='person')]
Here is something that addresses all your requirements. It handles joining of multiple lines, ignoring blank lines, and ignoring junk lines that do not appear within a block. It is implemented as a generator that yields each dictionary as it is completed.
def parser(data):
    d = {}
    for line in data:
        line = line.strip()
        if not line:
            if d:
                yield d
                d = {}
        else:
            if ':' in line:
                key, value = line.split(':')
                d[key] = value
            else:
                if d:
                    d[key] = '{} {}'.format(d[key], line)
    if d:
        yield d
When run with this data:
ignore me
NAME:name1
ID:id1
PERSON:person1
LOCATION:location1
NAME:name2
morenamestuff
ID:id2
PERSON:person2
LOCATION:location2
junk
and
other
stuff
NAME:name3
morenamestuff
and more
ID:id3
PERSON:person3
more person stuff
LOCATION:location3
JUNK
MORE JUNK
>>> for d in parser(open('data')):
... print d
{'PERSON': 'person1', 'LOCATION': 'location1', 'NAME': 'name1', 'ID': 'id1'}
{'PERSON': 'person2', 'LOCATION': 'location2', 'NAME': 'name2 morenamestuff', 'ID': 'id2'}
{'PERSON': 'person3 more person stuff', 'LOCATION': 'location3', 'NAME': 'name3 morenamestuff and more', 'ID': 'id3'}
You can grab the lot as a list:
>>> results = list(parser(open('data')))
>>> results
[{'PERSON': 'person1', 'LOCATION': 'location1', 'NAME': 'name1', 'ID': 'id1'}, {'PERSON': 'person2', 'LOCATION': 'location2', 'NAME': 'name2 morenamestuff', 'ID': 'id2'}, {'PERSON': 'person3 more person stuff', 'LOCATION': 'location3', 'NAME': 'name3 morenamestuff and more', 'ID': 'id3'}]
I don't find itertools or regex particularly nice to work with; here's a pure-Python solution:
separator = ':'
output = []
chunk = None
with open('/tmp/stuff.txt') as f:
    for line in (x.strip() for x in f):
        if not line:
            # we are between 'chunks'
            chunk, key = None, None
            continue
        if chunk is None:
            # we are at the beginning of a new 'chunk'
            chunk, key = {}, None
            output.append(chunk)
        if separator in line:
            key, val = line.split(separator)
            chunk[key] = val
        else:
            chunk[key] += line
Not as elegant as you requested, but this works:
dat = [['NAME:name',
        'ID:id',
        'PERSON:person',
        'LOCATION:location'],
       ['NAME:name',
        'morenamestuff',
        'ID:id',
        'PERSON:person',
        'LOCATION:location']]

k = 1
key_map = dict(x.split(':') for x in dat[k] if ':' in x)
if ':' not in dat[k][1]: key_map['NAME'] += dat[k][1]

key_map
>>
{'ID': 'id',
 'LOCATION': 'location',
 'NAME': 'namemorenamestuff',
 'PERSON': 'person'}
Just add something to lines with no ":".
if line.find(':') == -1:
    line = line + ':None'
Then you won't get an error.
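A small sketch of how that fix plays out on one chunk from the question (note the continuation line ends up as its own key with the value 'None', rather than being merged into the previous value):
chunk = ['Title', 'NAME:name', 'morenamestuff', 'ID:id', 'PERSON:person', 'LOCATION:location']

fixed = []
for line in chunk[1:]:
    if line.find(':') == -1:
        line = line + ':None'
    fixed.append(line)

key_map = dict(x.split(':') for x in fixed)
# {'NAME': 'name', 'morenamestuff': 'None', 'ID': 'id', 'PERSON': 'person', 'LOCATION': 'location'}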