I am trying to slice a JSON file. The file looks like this:
{"price": 17.95, "categories": [["Musical Instruments", "Instrument Accessories", "General Accessories", "Sheet Music Folders"]], "imUrl": "http://ecx.images-amazon.com/images/I/41EpRmh8MEL._SY300_.jpg", "title": "Six Sonatas For Two Flutes Or Violins, Volume 2 (#4-6)", "salesRank": {"Musical Instruments": 207315}, "asin": "0006428320"}
{"description": "Composer: J.S. Bach.Peters Edition.For two violins and pianos.", "related": {"also_viewed": ["B0058DK7RA"], "buy_after_viewing": ["B0058DK7RA"]}, "categories": [["Musical Instruments"]], "brand": "", "imUrl": "http://ecx.images-amazon.com/images/I/41m6ygCqc8L._SY300_.jpg", "title": "Double Concerto in D Minor By Johann Sebastian Bach. Edited By David Oistrach. For Violin I, Violin Ii and Piano Accompaniment. Urtext. Baroque. Medium. Set of Performance Parts. Solo Parts, Piano Reduction and Introductory Text. BWV 1043.", "salesRank": {"Musical Instruments": 94593}, "asin": "0014072149", "price": 18.77}
{"asin": "0041291905", "categories": [["Musical Instruments", "Instrument Accessories", "General Accessories", "Sheet Music Folders"]], "imUrl": "http://ecx.images-amazon.com/images/I/41maAqSO9hL._SY300_.jpg", "title": "Hal Leonard Vivaldi Four Seasons for Piano (Original Italian Text)", "salesRank": {"Musical Instruments": 222972}, "description": "Vivaldi's famous set of four violin concertos certainly ranks among the all-time top ten classical favorites. Features include an introduction about the history of The Four Seasons and Vivaldi's original vivid Italian score markings. A must for classical purists."}
As you can see, the fields are not arranged in a strict order across the lines, and I only need some of the fields. So I wrote this code:
import json, csv

infile = open("sample_output.strict", "r")
outfile = open("output.csv", "w")
writer = csv.writer(outfile)
fileds = ["asin","price"]
for product in json.loads(infile.read()):
    line = []
    for f in fields:
        if product.has_key(f):
            line.append(product[f])
        else:
            line.append("")
    writer.write(line)
I got the error message below:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-3e335b184eea> in <module>()
6
7 fileds = ["asin","price"]
----> 8 for product in json.loads(infile.read()):
9 line = []
10 for f in fields:
C:\Anaconda3\lib\json\__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
316 parse_int is None and parse_float is None and
317 parse_constant is None and object_pairs_hook is None and not kw):
--> 318 return _default_decoder.decode(s)
319 if cls is None:
320 cls = JSONDecoder
C:\Anaconda3\lib\json\decoder.py in decode(self, s, _w)
344 end = _w(s, end).end()
345 if end != len(s):
--> 346 raise ValueError(errmsg("Extra data", s, end, len(s)))
347 return obj
348
ValueError: Extra data: line 2 column 1 - line 3 column 617 (char 339 - 1581)
What you have is lines of JSON, not a single JSON document. Change your program to read each line and parse it as JSON, then look in each resulting document that way. This format is actually pretty common (it is often called JSON Lines); I receive data to load in this format all the time.
Doing it line by line will also save you a lot of memory if you end up dealing with large files.
import json, csv

with open("sample_output.strict", "r") as infile:
    with open("output.csv", "w", newline="") as outfile:  # newline="" avoids blank rows from the csv module on Windows
        writer = csv.writer(outfile)
        fields = ["asin", "price"]
        for json_line in infile:
            product = json.loads(json_line)
            line = []
            for f in fields:
                if f in product:  # dict.has_key() no longer exists in Python 3; use the in operator
                    line.append(product[f])
                else:
                    line.append("")
            writer.writerow(line)
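As a side note, since a missing key should just become an empty string, the inner if/else collapses neatly into dict.get; a minimal sketch of the same logic:

import json, csv

with open("sample_output.strict", "r") as infile:
    with open("output.csv", "w", newline="") as outfile:
        writer = csv.writer(outfile)
        fields = ["asin", "price"]
        for json_line in infile:
            product = json.loads(json_line)
            # dict.get(key, "") returns "" when the key is absent
            writer.writerow([product.get(f, "") for f in fields])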
Your input JSON file is ill-formed, which is why you are seeing this error: strictly speaking, you cannot have multiple top-level JSON objects in a single document, and in your case the parser sees three of them. One solution is to wrap them in a top-level list, like this:
[
{"price": 17.95, "categories": [["Musical Instruments", "Instrument Accessories", "General Accessories", "Sheet Music Folders"]], "imUrl": "http://ecx.images-amazon.com/images/I/41EpRmh8MEL._SY300_.jpg", "title": "Six Sonatas For Two Flutes Or Violins, Volume 2 (#4-6)", "salesRank": {"Musical Instruments": 207315}, "asin": "0006428320"},
{"description": "Composer: J.S. Bach.Peters Edition.For two violins and pianos.", "related": {"also_viewed": ["B0058DK7RA"], "buy_after_viewing": ["B0058DK7RA"]}, "categories": [["Musical Instruments"]], "brand": "", "imUrl": "http://ecx.images-amazon.com/images/I/41m6ygCqc8L._SY300_.jpg", "title": "Double Concerto in D Minor By Johann Sebastian Bach. Edited By David Oistrach. For Violin I, Violin Ii and Piano Accompaniment. Urtext. Baroque. Medium. Set of Performance Parts. Solo Parts, Piano Reduction and Introductory Text. BWV 1043.", "salesRank": {"Musical Instruments": 94593}, "asin": "0014072149", "price": 18.77},
{"asin": "0041291905", "categories": [["Musical Instruments", "Instrument Accessories", "General Accessories", "Sheet Music Folders"]], "imUrl": "http://ecx.images-amazon.com/images/I/41maAqSO9hL._SY300_.jpg", "title": "Hal Leonard Vivaldi Four Seasons for Piano (Original Italian Text)", "salesRank": {"Musical Instruments": 222972}, "description": "Vivaldi's famous set of four violin concertos certainly ranks among the all-time top ten classical favorites. Features include an introduction about the history of The Four Seasons and Vivaldi's original vivid Italian score markings. A must for classical purists."}
]
Then you can use the following piece of code to slice:
import json, csv

infile = open("sample_output.strict", "r")
jsondata = json.loads(infile.read())
outfile = open("output.csv", "w")
writer = csv.writer(outfile)
fields = ["asin", "price"]
for product in jsondata:
    line = []
    for f in fields:
        if f in product:
            line.append(product[f])
        else:
            line.append("")
    writer.writerow(line)
I don't fully understand what you're trying to do with the CSV output, so I kept your logic (only correcting writer.write to writer.writerow, which is the method the csv module actually provides) to demonstrate the fix.
I am learning how to get data from arrays, and I am slightly stuck on finding an easy way of locating the data I want to pull from the array. It feels like there should be an easier way than counting positions on the screen.
Here is what I have:
import requests
import numpy as np

r2 = requests.get(
    f'https://www.thesportsdb.com/api/v1/json/{apiKey}/lookupevent.php?id={id}')
arr_events = np.array([r2.json()])
#print(arr_events)

event_id = arr_events[0]['events'][0]['idEvent']
locate = arr_events.index('strHomeTeam')
print(locate)
The problem is, on the console this prints out a massive array that looks like (I'll give one line, you probably get the idea):
[{'events': [{'idEvent': '1032723', 'idSoccerXML': None, 'idAPIfootball': '592172', 'strEvent': 'Aston Villa vs Liverpool', 'strEventAlternate': 'Liverpool # Aston Villa', 'strFilename': 'English Premier League 2020-10-04 Aston Villa vs Liverpool'...}]}]
It's a sizeable array, enough to cause a minor slowdown if I need to pull some info.
So, idEvent was easy to pull using the method above. And if I wanted some of these others in the top line, it's probably not hard to count to 5 or 6. But I know there must be an easier way for Python to just locate the ones I want. For instance, I want the home and away team:
'strHomeTeam': 'Aston Villa', 'strAwayTeam': 'Liverpool',
So is there an easier way to just pull the 'strHomeTeam' rather than counting all the way to the point in the array?
I realise this is a basic question - and I have searched and searched, but everything seems to be in a single, really small array and they don't seem to explain getting the data from big arrays easily.
The JSON file is here: https://www.thesportsdb.com/api/v1/json/1/lookupevent.php?id=1032723
Thank you for your help on this - I appreciate it.
So is there an easier way to just pull the 'strHomeTeam' rather than counting all the way to the point in the array?
Try the below
data = {"events": [
{"idEvent": "1032723", "idSoccerXML": "", "idAPIfootball": "592172", "strEvent": "Aston Villa vs Liverpool",
"strEventAlternate": "Liverpool # Aston Villa",
"strFilename": "English Premier League 2020-10-04 Aston Villa vs Liverpool", "strSport": "Soccer",
"idLeague": "4328", "strLeague": "English Premier League", "strSeason": "2020-2021",
"strDescriptionEN": "Aston Villa and Liverpool square off at Villa Park, where last season, these teams produced one of the most exciting finishes of the campaign, as Liverpool scored twice late on to overturn an early Trezeguet goal.",
"strHomeTeam": "Aston Villa", "strAwayTeam": "Liverpool", "intHomeScore": "7", "intRound": "4",
"intAwayScore": "2", "intSpectators": "", "strOfficial": "", "strHomeGoalDetails": "", "strHomeRedCards": "",
"strHomeYellowCards": "", "strHomeLineupGoalkeeper": "", "strHomeLineupDefense": "",
"strHomeLineupMidfield": "", "strHomeLineupForward": "", "strHomeLineupSubstitutes": "",
"strHomeFormation": "", "strAwayRedCards": "", "strAwayYellowCards": "", "strAwayGoalDetails": "",
"strAwayLineupGoalkeeper": "", "strAwayLineupDefense": "", "strAwayLineupMidfield": "",
"strAwayLineupForward": "", "strAwayLineupSubstitutes": "", "strAwayFormation": "", "intHomeShots": "",
"intAwayShots": "", "strTimestamp": "2020-10-04T18:15:00+00:00", "dateEvent": "2020-10-04",
"dateEventLocal": "2020-10-04", "strDate": "", "strTime": "18:15:00", "strTimeLocal": "19:15:00",
"strTVStation": "", "idHomeTeam": "133601", "idAwayTeam": "133602", "strResult": "", "strVenue": "Villa Park",
"strCountry": "England", "strCity": "", "strPoster": "", "strFanart": "",
"strThumb": "https:\/\/www.thesportsdb.com\/images\/media\/event\/thumb\/r00vzl1601721606.jpg", "strBanner": "",
"strMap": "", "strTweet1": "https:\/\/twitter.com\/brfootball\/status\/1312843172385521665",
"strTweet2": "https:\/\/twitter.com\/TomJordan21\/status\/1312854281444306946",
"strTweet3": "https:\/\/twitter.com\/FutbolBible\/status\/1312847622592442370",
"strVideo": "https:\/\/www.youtube.com\/watch?v=0Nbw3jSafGM", "strStatus": "Match Finished", "strPostponed": "no",
"strLocked": "unlocked"}]}
filtered_data = [{'home': entry['strHomeTeam'], 'away': entry['strAwayTeam']} for entry in data['events']]
print(filtered_data)
Output:
[{'home': 'Aston Villa', 'away': 'Liverpool'}]
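If you need more than those two keys, the same comprehension generalizes; wanted below is just an illustrative list of key names:

# pull any subset of keys from each event; .get returns None for absent keys
wanted = ['strHomeTeam', 'strAwayTeam', 'intHomeScore', 'intAwayScore']
extracted = [{k: event.get(k) for k in wanted} for event in data['events']]
print(extracted)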
Ug... I tried something different and it worked - sigh... I am sorry.
event_id = arr_events[0]['events'][0]['idEvent']
home_team = arr_events[0]['events'][0]['strHomeTeam']
away_team = arr_events[0]['events'][0]['strAwayTeam']
home_score = arr_events[0]['events'][0]['intHomeScore']
away_score = arr_events[0]['events'][0]['intAwayScore']
I assume this is the right way to do it.
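If so, one small tidy-up (a sketch of the same lookups): bind the inner dict once instead of repeating the full index chain:

# arr_events[0]['events'][0] is the single event dict; bind it once
event = arr_events[0]['events'][0]
home_team = event['strHomeTeam']
away_team = event['strAwayTeam']
home_score = event['intHomeScore']
away_score = event['intAwayScore']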
You should look into JSON Pointer:
https://python-json-pointer.readthedocs.io/en/latest/tutorial.html
Inspect the JSON, work out the path to the value you want, then use https://github.com/stefankoegl/python-json-pointer to resolve it.
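For example, a minimal sketch, assuming the jsonpointer package from the links above is installed (pip install jsonpointer):

from jsonpointer import resolve_pointer

data = {'events': [{'strHomeTeam': 'Aston Villa', 'strAwayTeam': 'Liverpool'}]}
# JSON Pointer paths address list elements by numeric index
print(resolve_pointer(data, '/events/0/strHomeTeam'))  # Aston Villa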
I have a text file that is 26 GB. The line format is as follows:
/type/edition /books/OL10000135M 4 2010-04-24T17:54:01.503315 {"publishers": ["Bernan Press"], "physical_format": "Hardcover", "subtitle": "9th November - 3rd December, 1992", "key": "/books/OL10000135M", "title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93", "identifiers": {"goodreads": ["6850240"]}, "isbn_13": ["9780107805401"], "languages": [{"key": "/languages/eng"}], "number_of_pages": 64, "isbn_10": ["0107805405"], "publish_date": "December 1993", "last_modified": {"type": "/type/datetime", "value": "2010-04-24T17:54:01.503315"}, "authors": [{"key": "/authors/OL2645777A"}], "latest_revision": 4, "works": [{"key": "/works/OL7925046W"}], "type": {"key": "/type/edition"}, "subjects": ["Government - Comparative", "Politics / Current Events"], "revision": 4}
I'm trying to get only the last column, which is JSON, and from that JSON I only want to save the "title", "isbn_13", and "isbn_10" values.
I was able to save only the last column with this code:
import csv
import sys

csv.field_size_limit(sys.maxsize)

# File names: to read in from and read out to
input_file = '../inputFile/ol_dump_editions_2019-10-31.txt'
output_file = '../outputFile/output.txt'

## ==================== ##
##  Using module 'csv'  ##
## ==================== ##

with open(input_file) as to_read:
    with open(output_file, "w") as tmp_file:
        reader = csv.reader(to_read, delimiter="\t")
        writer = csv.writer(tmp_file)
        desired_column = [4]  # text column
        for row in reader:  # read one row at a time
            myColumn = list(row[i] for i in desired_column)  # build the output row (process)
            writer.writerow(myColumn)  # write it
but this doesn't give me a proper JSON object; instead it returns everything with doubled quotation marks next to it. Also, how would I extract certain values from the JSON as a new JSON?
EDIT:
"{""publishers"": [""Bernan Press""], ""physical_format"": ""Hardcover"", ""subtitle"": ""9th November - 3rd December, 1992"", ""key"": ""/books/OL10000135M"", ""title"": ""Parliamentary Debates, House of Lords, Bound Volumes, 1992-93"", ""identifiers"": {""goodreads"": [""6850240""]}, ""isbn_13"": [""9780107805401""], ""languages"": [{""key"": ""/languages/eng""}], ""number_of_pages"": 64, ""isbn_10"": [""0107805405""], ""publish_date"": ""December 1993"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2010-04-24T17:54:01.503315""}, ""authors"": [{""key"": ""/authors/OL2645777A""}], ""latest_revision"": 4, ""works"": [{""key"": ""/works/OL7925046W""}], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Government - Comparative"", ""Politics / Current Events""], ""revision"": 4}"
EDIT 2:
So I'm trying to read this file, which is a tab-separated file with the following columns:
type - type of record (/type/edition, /type/work etc.)
key - unique key of the record. (/books/OL1M etc.)
revision - revision number of the record
last_modified - last modified timestamp
JSON - the complete record in JSON format
I'm trying to read the file, and from the JSON column I only want to get the "title", "isbn_13", and "isbn_10" as a JSON object and save it to the output file as a row,
so every row should look like the original but with only those keys and values.
Here's a straightforward way of doing it. You would need to repeat this, extracting the desired data from each line of the file as it's being read, line by line (the default way text-file reading is handled in Python).
import json
line = '/type/edition /books/OL10000135M 4 2010-04-24T17:54:01.503315 {"publishers": ["Bernan Press"], "physical_format": "Hardcover", "subtitle": "9th November - 3rd December, 1992", "key": "/books/OL10000135M", "title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93", "identifiers": {"goodreads": ["6850240"]}, "isbn_13": ["9780107805401"], "languages": [{"key": "/languages/eng"}], "number_of_pages": 64, "isbn_10": ["0107805405"], "publish_date": "December 1993", "last_modified": {"type": "/type/datetime", "value": "2010-04-24T17:54:01.503315"}, "authors": [{"key": "/authors/OL2645777A"}], "latest_revision": 4, "works": [{"key": "/works/OL7925046W"}], "type": {"key": "/type/edition"}, "subjects": ["Government - Comparative", "Politics / Current Events"], "revision": 4}'
csv_cols = line.split('\t')
json_data = json.loads(csv_cols[4])
#print(json.dumps(json_data, indent=4))
desired = {key: json_data[key] for key in ("title", "isbn_13", "isbn_10")}
result = json.dumps(desired, indent=4)
print(result)
Output from sample line:
{
"title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93",
"isbn_13": [
"9780107805401"
],
"isbn_10": [
"0107805405"
]
}
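Applied to the whole dump line by line, a sketch reusing the paths from your question (json_data.get returns None for records that lack one of the keys):

import json

input_file = '../inputFile/ol_dump_editions_2019-10-31.txt'
output_file = '../outputFile/output.txt'

with open(input_file) as to_read, open(output_file, 'w') as out:
    for line in to_read:
        # column 4 (zero-based) of the tab-separated line is the JSON record
        json_data = json.loads(line.split('\t')[4])
        desired = {key: json_data.get(key) for key in ('title', 'isbn_13', 'isbn_10')}
        out.write(json.dumps(desired) + '\n')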
So given that your current code returns the following:
result = '{""publishers"": [""Bernan Press""], ""physical_format"": ""Hardcover"", ""subtitle"": ""9th November - 3rd December, 1992"", ""key"": ""/books/OL10000135M"", ""title"": ""Parliamentary Debates, House of Lords, Bound Volumes, 1992-93"", ""identifiers"": {""goodreads"": [""6850240""]}, ""isbn_13"": [""9780107805401""], ""languages"": [{""key"": ""/languages/eng""}], ""number_of_pages"": 64, ""isbn_10"": [""0107805405""], ""publish_date"": ""December 1993"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2010-04-24T17:54:01.503315""}, ""authors"": [{""key"": ""/authors/OL2645777A""}], ""latest_revision"": 4, ""works"": [{""key"": ""/works/OL7925046W""}], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Government - Comparative"", ""Politics / Current Events""], ""revision"": 4}'
Looks like what you need to do is: first, replace those doubled double-quotes with regular double quotes, otherwise the string is not parsable:
res = result.replace('""','"')
Now res is convertible to a JSON object:
import json
my_json = json.loads(res)
my_json now looks like this:
{'authors': [{'key': '/authors/OL2645777A'}],
'identifiers': {'goodreads': ['6850240']},
'isbn_10': ['0107805405'],
'isbn_13': ['9780107805401'],
'key': '/books/OL10000135M',
'languages': [{'key': '/languages/eng'}],
'last_modified': {'type': '/type/datetime',
'value': '2010-04-24T17:54:01.503315'},
'latest_revision': 4,
'number_of_pages': 64,
'physical_format': 'Hardcover',
'publish_date': 'December 1993',
'publishers': ['Bernan Press'],
'revision': 4,
'subjects': ['Government - Comparative', 'Politics / Current Events'],
'subtitle': '9th November - 3rd December, 1992',
'title': 'Parliamentary Debates, House of Lords, Bound Volumes, 1992-93',
'type': {'key': '/type/edition'},
'works': [{'key': '/works/OL7925046W'}]}
You can conveniently get any field you want from this object:
my_json['title']
# 'Parliamentary Debates, House of Lords, Bound Volumes, 1992-93'
my_json['isbn_10'][0]
# '0107805405'
Especially because your example is so large, I'd recommend using a specialized library such as pandas, which has a read_csv method, or even dask, which supports out-of-memory operations.
Both of these systems will automatically parse out the quotations for you, and dask will do so in "pieces" direct from disk so you never have to try to load 26GB into RAM.
In both libraries, you can then access the columns you want like this:
data = read_csv(PATH, sep="\t")  # pandas.read_csv or dask.dataframe.read_csv; the dump is tab-separated
data["ColumnName"]
You can then parse these rows either using json.loads() (import json) or you can use the pandas/dask json implementations. If you can give some more details of what you're expecting, I can help you draft a more specific code example.
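For instance, a rough pandas sketch under a few assumptions (the dump has no header row, the five columns are the ones listed in EDIT 2, and chunksize keeps memory bounded for a 26 GB file):

import json
import pandas as pd

cols = ['type', 'key', 'revision', 'last_modified', 'json']
chunks = pd.read_csv('ol_dump_editions_2019-10-31.txt', sep='\t',
                     names=cols, chunksize=100000)
for chunk in chunks:
    # parse the JSON column of each chunk, then pull out the wanted keys
    for record in chunk['json'].map(json.loads):
        print(record.get('title'), record.get('isbn_13'), record.get('isbn_10'))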
Good luck!
I saved your data to a file to see if I could read just the rows; let me know if this works:
import json

# zzread holds the raw contents of the file, e.g. zzread = open(input_file).read()
lines = zzread.split('\n')
temp = []
for to_read in lines:
    if len(to_read) == 0:
        break
    new_to_read = '{' + to_read.split('{', 1)[1]
    temp.append(json.loads(new_to_read))

for row in temp:
    print(row['isbn_13'])
If that works this should create a json for you:
lines = zzread.split('\n')
temp = []
for to_read in lines:
    if len(to_read) == 0:
        break
    new_to_read = '{' + to_read.split('{', 1)[1]
    temp.append(json.loads(new_to_read))

new_json = []
for row in temp:
    new_json.append({'title': row['title'], 'isbn_13': row['isbn_13'], 'isbn_10': row['isbn_10']})
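If that works and you want each trimmed record saved to the output file as a row (one JSON object per line, matching your input), a sketch:

import json

with open('output.txt', 'w') as out:
    for row in new_json:
        out.write(json.dumps(row) + '\n')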
I'm trying to scrape some text files into a DB. The format is similar to this, with a couple of thousand segments like this:
Posted By
Date
John Keys
31.08.2019, 10:10 AM
Peter Hall 200 150
Ed Parker 14 1
Posted By
Date
John Keys
31.08.2019, 10:15 AM
Rose Stone 200 150
Travis Anderson 14 1
The records that are important are the ones coming right after "Date" - so the logic is:
inside_match_flag = 0
for line in ins:
    if inside_match_flag == 1:
        inside_match_flag = 2  # add one to it as we will get all lines
    if line == "Posted By":    # until we see Posted By again (or EOF)
        inside_match_flag = 0  # we are now outside the segment
    if line == "Date":         # lines after Date are the ones we want
        inside_match_flag = 1  # the following lines are to be stored
So this is the way I've done it before (the above is not the running code): keep track of a flag, and depending on the flag value I know which lines are most likely coming next.
The issue is of course the 'lines coming next' part: as I'm reading line by line, I can't easily grab out these segments, and I don't want to rely on loading the complete file into memory (it can get huge).
But the code always gets ugly when I implement something like this - does anyone here have a much smarter approach?
And note: I am also interested in a super-smart, compact way to do this even if it requires loading everything into memory, as long as the code doesn't get so ugly; if it's all in memory, I guess I can just look for the Date field and save all lines until Posted By appears again.
Edit 1
Note the number of players can be more than 2 per game, so a record could also look like this :
Posted By
Date
John Keys
31.08.2019, 10:10 AM
Peter Hall 200 150
Ed Parker 54 1
Rose Stone 20 15
Travis Anderson 1 150
Posted By
...
....
My dream format would be to have an object like this - an example based on the match above with 4 players:
{
"Game 1:"
{
"posted by" : "john keys"
"date" : "31.08.2019, 10:10 AM"
"players" : {
{ 1, "Peter Hall, "200", "150" }
{ 2, Ed Parker, "54", "1" }
{ 3 , Rose Stone, "20", "15" }
{ 4, Travis Anderson, "1", "150" }
}
}
}
Note: not 100% correct JSON format there - and it doesn't have to be JSON, just an object, as I will throw them into an SQLite database where each game is stored as its own record, as illustrated above.
An optimized and memory-efficient generator-function approach, which yields records on demand:
import pprint

def extract_records(fname):
    def prepare_record(rec):
        return {'posted by': rec[0], 'date': rec[1],
                'players': [[i] + p.rsplit(maxsplit=2)
                            for i, p in enumerate(rec[2:], 1)]}

    with open(fname) as f:
        record = []
        add_item = False
        for line in f:
            line = line.strip()
            if line == 'Date':
                add_item = True
                continue
            elif line == 'Posted By':
                add_item = False
                if record:
                    yield prepare_record(record)
                    record = []
                continue
            if add_item:
                record.append(line)
        if record:
            yield prepare_record(record)

records_gen = extract_records('datafile.txt')  # generator

for rec in records_gen:
    pprint.pprint(rec)  # further processing, ex. inserting into DB
The output (2 sample records):
{'date': '31.08.2019, 10:10 AM',
'players': [[1, 'Peter Hall', '200', '150'],
[2, 'Ed Parker', '14', '1'],
[3, 'Rose Stone', '20', '15'],
[4, 'Travis Anderson', '1', '150']],
'posted by': 'John Keys'}
{'date': '31.08.2019, 10:15 AM',
'players': [[1, 'Rose Stone', '200', '150'],
[2, 'Travis Anderson', '14', '1']],
'posted by': 'John Keys'}
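Since the records are ultimately headed for SQLite, here is a sketch of the loading step; the database, table, and column names are illustrative assumptions, and the players list is stored as a JSON string for simplicity:

import json
import sqlite3

conn = sqlite3.connect('games.db')  # hypothetical database file
conn.execute('CREATE TABLE IF NOT EXISTS games (posted_by TEXT, date TEXT, players TEXT)')
for rec in extract_records('datafile.txt'):
    conn.execute('INSERT INTO games VALUES (?, ?, ?)',
                 (rec['posted by'], rec['date'], json.dumps(rec['players'])))
conn.commit()
conn.close()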
There is no magic method for this specific case. Here is an example solution:
buf_size = ...  # choose a read size, e.g. a few tens of KB
start_marker = "Posted By\n"
date_marker = "Date\n"

def parse_game(filename):
    fh = open(filename)
    page = ""
    buffer = True  # just the start value
    while buffer:
        buffer = fh.read(buf_size)
        page += buffer
        records = page.split(start_marker)
        if buffer:
            page = records.pop()  # keep the incomplete tail for the next read
        for record in records:
            if date_marker not in record:
                continue  # e.g. the empty chunk before the first marker
            # skip everything before "Date" and split by lines
            chunks = record.split(date_marker, 1)[-1].split("\n")
            posted_by, date = chunks[:2]
            players = [chunk.split() for chunk in chunks[2:] if chunk.strip()]
            yield {
                "posted_by": posted_by,
                "date": date,
                "players": players
            }
If you can read the whole file into memory, it will be just:
def read_game(filename):
    for record in open(filename).read().split(start_marker):
        if date_marker not in record:
            continue  # skip anything before the first marker
        # skip everything before "Date" and split by lines
        chunks = record.split(date_marker, 1)[-1].split("\n")
        posted_by, date = chunks[:2]
        players = [chunk.split() for chunk in chunks[2:] if chunk.strip()]
        yield {
            "posted_by": posted_by,
            "date": date,
            "players": players
        }
This solution is very similar to Roman's. It is slightly less memory-efficient (assuming you have buf_size of memory available), but will result in less I/O.
I am stuck on an InvalidSemanticsException error; following is my code:
import json
from py2neo import neo4j, Node, Relationship, Graph

graph = Graph()
graph.schema.create_uniqueness_constraint("Authors", "auth_name")
graph.schema.create_uniqueness_constraint("Mainstream_News", "id")

with open("example.json") as f:
    for line in f:
        while True:
            try:
                file = json.loads(line)
                break
            except ValueError:
                # Not yet a complete JSON value
                line += next(f)
        # Now creating the node and relationships
        news = graph.merge_one("Mainstream_News", {"id": unicode(file["_id"]["$oid"]), "entry_url": unicode(file["entry_url"]), "title": unicode(file["title"])})
        authors = graph.merge_one("Authors", {"auth_name": unicode(file["auth_name"]), "auth_url": unicode(file["auth_url"]), "auth_eml": unicode(file["auth_eml"])})
        graph.create_unique(Relationship(news, "hasAuthor", authors))
I am trying to connect the news node with the authors node. My JSON file looks like this:
{
"_id": {
"$oid": "54933912bf4620870115a2e3"
},
"auth_eml": "",
"auth_url": "",
"cat": [],
"auth_name": "Max Bond",
"out_link": [],
"entry_url": [
"http://www.usda.gov/wps/portal/usda/!ut/p/c5/04_SB8K8xLLM9MSSzPy8xBz9CP0os_gAC9-wMJ8QY0MDpxBDA09nXw9DFxcXQ-cAA_1wkA5kFaGuQBXeASbmnu4uBgbe5hB5AxzA0UDfzyM_N1W_IDs7zdFRUREAZXAypA!!/dl3/d3/L2dJQSEvUUt3QS9ZQnZ3LzZfUDhNVlZMVDMxMEJUMTBJQ01IMURERDFDUDA!/?navtype=SU&navid=AGRICULTURE"
],
"out_link_norm": [],
"title": "United States Department of Agriculture - Agriculture",
"entry_url_norm": [
"usda.gov/wps/portal/usda/!ut/p/c5/04_SB8K8xLLM9MSSzPy8xBz9CP0os_gAC9-wMJ8QY0MDpxBDA09nXw9DFxcXQ-cAA_1wkA5kFaGuQBXeASbmnu4uBgbe5hB5AxzA0UDfzyM_N1W_IDs7zdFRUREAZXAypA!!/dl3/d3/L2dJQSEvUUt3QS9ZQnZ3LzZfUDhNVlZMVDMxMEJUMTBJQ01IMURERDFDUDA!/"
],
"ts": 1290945374000,
"source_url": "",
"content": "\n<a\nhref=\"/wps/portal/usda/!ut/p/c4/04_SB8K8xLLM9MSSzPy8xBz9CP0os_gAC9-wMJ8QY0MDpxBDA09nXw9DFxcXQ-cAA_2CbEdFAEUOjoE!/?navid=AVIAN_INFLUENZA\">\n<b>Avian Influenza, Bird Flu</b></a> <br />\nThe official U.S. government web site for information on pandemic flu and avian influenza\n\n<strong>Pest Management</strong> <br />\nPest management policy, pesticide screening tool, evaluate pesticide risk, conservation\nbuffers, training modules.\n\n<strong>Weather and Climate</strong> <br />\nU.S. agricultural weather highlights, weekly weather and crop bulletin, major world crop areas\nand climatic profiles.\n"
}
The full exception error is like this:
File "/home/mohan/workspace/test.py", line 20, in <module>
news = graph.merge_one("Mainstream_News", {"id": unicode(file["_id"]["$oid"]), "entry_url": unicode(file["entry_url"]),"title":unicode(file["title"])})
File "/usr/local/lib/python2.7/dist-packages/py2neo/core.py", line 958, in merge_one
for node in self.merge(label, property_key, property_value, limit=1):
File "/usr/local/lib/python2.7/dist-packages/py2neo/core.py", line 946, in merge
response = self.cypher.post(statement, parameters)
File "/usr/local/lib/python2.7/dist-packages/py2neo/cypher/core.py", line 86, in post
return self.resource.post(payload)
File "/usr/local/lib/python2.7/dist-packages/py2neo/core.py", line 331, in post
raise_from(self.error_class(message, **content), error)
File "/usr/local/lib/python2.7/dist-packages/py2neo/util.py", line 235, in raise_from
raise exception
py2neo.error.InvalidSemanticsException: Cannot merge node using null property value for {'title': u'United States Department of Agriculture - Agriculture', 'id': u'54933912bf4620870115a2e3', 'entry_url': u"[u'http://www.usda.gov/wps/portal/usda/!ut/p/c5/04_SB8K8xLLM9MSSzPy8xBz9CP0os_gAC9-wMJ8QY0MDpxBDA09nXw9DFxcXQ-cAA_1wkA5kFaGuQBXeASbmnu4uBgbe5hB5AxzA0UDfzyM_N1W_IDs7zdFRUREAZXAypA!!/dl3/d3/L2dJQSEvUUt3QS9ZQnZ3LzZfUDhNVlZMVDMxMEJUMTBJQ01IMURERDFDUDA!/?navtype=SU&navid=AGRICULTURE']"}
Any suggestions to fix this ?
Yeah, I see what's going on here. If you look at the py2neo API and look for the merge_one function, it's defined this way:
merge_one(label, property_key=None, property_value=None)
Match or create a node by label and optional property and
return a single matching node. This method is intended to be
used with a unique constraint and does not fail if more than
one matching node is found.
The way that you're calling it is with a string first (label) and then a dictionary:
news = graph.merge_one("Mainstream_News", {"id": unicode(file["_id"]["$oid"]), "entry_url": unicode(file["entry_url"]),"title":unicode(file["title"])})
Your error message says that py2neo is treating the entire dictionary like a property name, and you haven't provided a property value.
So you're calling this function incorrectly. What you should probably be doing is merge_one only on the basis of the id property, then later adding the extra properties you need to the node that comes back.
You need to convert those merge_one calls into something like this:
news = graph.merge_one("Mainstream_News", "id", unicode(file["_id"]["$oid"]))
Note this doesn't give you the extra properties, those you'd add later.
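For example, a sketch assuming py2neo 2.x's properties/push API (the version your traceback paths point to):

news = graph.merge_one("Mainstream_News", "id", unicode(file["_id"]["$oid"]))
# set the remaining properties on the returned node, then sync them to the server
news.properties["title"] = unicode(file["title"])
news.properties["entry_url"] = unicode(file["entry_url"])
news.push()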
I have a group of text files, and I am looking to sequentially add the second column from each text file into a new text file. The files are tab-delimited and of the following format:
name dave
age 35
job teacher
income 30000
I have generated a file with the 1st column of one of these files in the place of the second column to hopefully simplify the problem:
0 name
0 age
0 job
0 income
I have a large number of these files and would like to have them all in a tab delimited text file such as:
name dave mike sue
age 35 28 40
job teacher postman solicitor
income 30000 20000 40000
I have a text file containing just the names of all the files called all_libs.txt
so far I have written:
# make a sorted list of the file names
with open('all_libs.txt', 'r') as lib:
    people = list([line.rstrip() for line in lib])
people_s = sorted(people)

i = 0
while i < len(people_s):
    with open(people_s[i]) as inf:
        for line in inf:
            parts = line.split()  # split line into parts
            if len(parts) > 1:    # if more than 1 discrete unit in parts
                with open("all_data.txt", 'a') as out_file:  # append column 2 to all_data
                    out_file.write((parts[1]) + "\n")
    i = i + 1  # go to the next file in the list
As each new file is opened, I would like to add its values as a new column rather than just appending them as new lines. I would really appreciate any help. I realize something like SQL would probably make this easy, but I have never used it and don't really have time to commit to the learning curve. Many thanks.
This is a very impractical way to store your data - each record is distributed over all the lines, so it's going to be hard to reconstruct the records when reading the file and (as you've seen) to add records.
You should be using a standard format like csv or (even better in a case like this) json:
For example, you could save them as CSV like this:
name,age,job,income
dave,35,teacher,30000
mike,28,postman,20000
sue,40,solicitor,40000
Reading this file:
>>> import csv
>>> with open("C:/Users/Tim/Desktop/people.csv", newline="") as infile:
... reader = csv.DictReader(infile)
... people = list(reader)
Now you have a list of people:
>>> people
[{'income': '30000', 'age': '35', 'name': 'dave', 'job': 'teacher'},
{'income': '20000', 'age': '28', 'name': 'mike', 'job': 'postman'},
{'income': '40000', 'age': '40', 'name': 'sue', 'job': 'solicitor'}]
which you can access easily:
>>> for item in people:
... print("{0[name]} is a {0[job]}, earning {0[income]} per year".format(item))
...
dave is a teacher, earning 30000 per year
mike is a postman, earning 20000 per year
sue is a solicitor, earning 40000 per year
Adding new records now is only a matter of adding them to the end of your file:
>>> with open("C:/Users/Tim/Desktop/people.csv", "a", newline="") as outfile:
... writer = csv.DictWriter(outfile,
... fieldnames=["name","age","job","income"])
... writer.writerow({"name": "paul", "job": "musician", "income": 123456,
... "age": 70})
Result:
name,age,job,income
dave,35,teacher,30000
mike,28,postman,20000
sue,40,solicitor,40000
paul,70,musician,123456
Or you can save it as JSON:
>>> import json
>>> with open("C:/Users/Tim/Desktop/people.json", "w") as outfile:
... json.dump(people, outfile, indent=1)
Result:
[
{
"income": "30000",
"age": "35",
"name": "dave",
"job": "teacher"
},
{
"income": "20000",
"age": "28",
"name": "mike",
"job": "postman"
},
{
"income": "40000",
"age": "40",
"name": "sue",
"job": "solicitor"
}
]
file_1 = """
name dave1
age 351
job teacher1
income 300001"""
file_2 = """
name dave2
age 352
job teacher2
income 300002"""
file_3 = """
name dave3
age 353
job teacher3
income 300003"""
template = """
0 name
0 age
0 job
0 income"""
Assume that the above is read from the files
_dict = {}

def concat():
    for cols in template.splitlines():
        if cols:
            _, col_name = cols.split()
            _dict[col_name] = []
    for each_file in [file_1, file_2, file_3]:
        data = each_file.splitlines()
        for line in data:
            if line:
                words = line.split()
                _dict[words[0]].append(words[1])
    _text = ""
    for key in _dict:
        _text += '\t'.join([key, '\t'.join(_dict[key]), '\n'])
    return _text

print concat()
OUTPUT
job teacher1 teacher2 teacher3
age 351 352 353
name dave1 dave2 dave3
income 300001 300002 300003
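Wiring the same idea up to your actual files, a sketch that assumes the file names listed in all_libs.txt and the two-column format shown above:

rows = {}
with open('all_libs.txt') as lib:
    filenames = sorted(line.rstrip() for line in lib)

for name in filenames:
    with open(name) as f:
        for line in f:
            parts = line.split()
            if len(parts) > 1:
                # first column is the row label, second is this file's value
                rows.setdefault(parts[0], []).append(parts[1])

with open('all_data.txt', 'w') as out:
    for key, values in rows.items():
        out.write('\t'.join([key] + values) + '\n')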