Python Scrapy Pipeline Edit Last Item?

I'm using a pipeline in Scrapy to output the scraped results into a JSON file. The pipeline places a comma after each scraped item; however, I want to drop the comma for the last item. Is there a way to do that?
This is the pipeline:
import json

class ExamplePipeline(object):

    def open_spider(self, spider):
        self.file = open('example.json', 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(
            dict(item),
            indent=4,
            sort_keys=True,
            separators=(',', ': ')
        ) + ",\n"
        self.file.write(line)
        return item
And the sample output looks like:
[
{
    "item1": "example",
    "item2": "example"
},
{
    "item1": "example",
    "item2": "example"
},
]
What is the python method to find the last item and not give it a comma separator? I thought I could do something like if item[-1] ... but I can't get that working.
Any ideas?

To apply this to your pipeline, you'll have to seek back in your file and delete that trailing comma when the spider closes (see the related question Python - Remove very last character in file):
class ExamplePipeline(object):

    def close_spider(self, spider):
        # go back 2 characters to drop the trailing ",\n"
        # (end-relative seeks are not allowed on text-mode files,
        # so seek to tell() - 2 instead of using os.SEEK_END)
        self.file.seek(self.file.tell() - 2)
        # cut the trailing data
        self.file.truncate()
        # close the JSON array and save
        self.file.write("]")
        self.file.close()
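Alternatively, a small sketch that avoids the seek entirely by writing the separator before every item except the first; this variant is my suggestion, not part of the original answer:

import json

class ExamplePipeline(object):

    def open_spider(self, spider):
        self.file = open('example.json', 'w')
        self.file.write("[")
        self.first_item = True

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Emit the comma *before* every item except the first, so there is
        # never a trailing comma to clean up.
        if not self.first_item:
            self.file.write(",\n")
        self.first_item = False
        self.file.write(json.dumps(dict(item), indent=4, sort_keys=True))
        return item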

Related

Adding a comma between JSON objects in a datafile with Python?

I have a large file (about 3 GB) which contains what looks like a JSON file but isn't, because it lacks commas (,) between "observations" or JSON objects (I have about 2 million of these "objects" in my data file).
For example, this is what I have:
{
    "_id": {
        "$id": "fh37fc3huc3"
    },
    "messageid": "4757724838492485088139042828",
    "attachments": [],
    "usernameid": "47284592942",
    "username": "Alex",
    "server": "475774810304151552",
    "text": "Must watch",
    "type": "462050823720009729",
    "datetime": "2018-08-05T21:20:20.486000+00:00",
    "type": {
        "$numberLong": "0"
    }
}
{
    "_id": {
        "$id": "23453532dwq"
    },
    "messageid": "232534",
    "attachments": [],
    "usernameid": "273342",
    "usernameid": "Alice",
    "server": "475774810304151552",
    "text": "https://www.youtube.com/",
    "type": "4620508237200097wd29",
    "datetime": "2018-08-05T21:20:11.803000+00:00",
    "type": {
        "$numberLong": "0"
    }
And this is what I want (the comma between "observations"):
{
    "_id": {
        "$id": "fh37fc3huc3"
    },
    "messageid": "4757724838492485088139042828",
    "attachments": [],
    "username": "Alex",
    "server": "475774810304151552",
    "type": {
        "$numberLong": "0"
    }
},
{
    "_id": {
        "$id": "23453532dwq"
    },
    "messageid": "232534",
    "attachments": [],
    "usernameid": "Alice",
    "server": "475774810304151552",
    "type": {
        "$numberLong": "0"
    }
This is what I tried but it doesn't give me a comma where I need it:
import re

with open('dataframe.txt', 'r') as input, open('out.txt', 'w') as output:
    output.write("[")
    for line in input:
        line = re.sub('', '},{', line)
        output.write(' '+line)
    output.write("]")
What can I do so that I can add a comma between each JSON object in my datafile?
This solution presupposes that none of the fields in the JSON contain { or }.
If we assume that there is at least one blank line between JSON dictionaries, one idea is to maintain a count of unclosed curly brackets ({) as unclosed_count; when we meet an empty line and the count is zero, we add the comma once.
Like this:
with open('test.json', 'r') as input_f, open('out.json', 'w') as output_f:
    output_f.write("[")
    unclosed_count = 0
    comma_after_zero_added = True
    for line in input_f:
        unclosed_count_change = line.count('{') - line.count('}')
        unclosed_count += unclosed_count_change
        if unclosed_count_change != 0:
            comma_after_zero_added = False
        if line.strip() == '' and unclosed_count == 0 and not comma_after_zero_added:
            output_f.write(",\n")
            comma_after_zero_added = True
        else:
            output_f.write(line)
    output_f.write("]")
Assuming sufficient memory, you can parse such a stream one object at a time using json.JSONDecoder.raw_decode directly, instead of using json.loads.
>>> import json
>>> x = '{"a": 1}\n{"b": 2}\n'  # hypothetical output of open("dataframe.txt").read()
>>> decoder = json.JSONDecoder()
>>> decoder.raw_decode(x)
({'a': 1}, 8)
>>> decoder.raw_decode(x, 9)
({'b': 2}, 17)
The output of raw_decode is a tuple containing the first JSON value decoded and the position in the string where the remaining data starts. (Note that json.loads just creates an instance of JSONDecoder, and calls the decode method, which just calls raw_decode and artificially raises an exception if the entire input isn't consumed by the first decoded value.)
A little extra work is involved; note that raw_decode can't start on whitespace, so you'll have to advance past any whitespace at the returned index before decoding the next value, as sketched below.
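For example, a minimal sketch of that bookkeeping; the file names dataframe.txt and out.txt come from the question, the iter_values helper name is made up here, and this assumes the whole file fits in memory:

import json

def iter_values(text):
    # Yield each JSON value found in text, skipping the whitespace between them.
    decoder = json.JSONDecoder()
    pos = 0
    while pos < len(text):
        if text[pos].isspace():
            pos += 1          # advance past whitespace before the next value
            continue
        value, pos = decoder.raw_decode(text, pos)
        yield value

with open('dataframe.txt', encoding='utf-8') as f:
    values = list(iter_values(f.read()))

with open('out.txt', 'w', encoding='utf-8') as out:
    json.dump(values, out, indent=4)  # writes one valid JSON array, commas included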
Another way to view your data is that you have multiple json records separated by whitespace. You can use the stdlib JSONDecoder to read each record, then strip whitespace and repeat til done. The decoder reads a record from a string and tells you how far it got. Apply that iteratively to the data until all is consumed. This is far less risky than making a bunch of assumptions about what data is contained in the json itself.
import json

def json_record_reader(filename):
    with open(filename, encoding="utf-8") as f:
        txt = f.read().lstrip()
    decoder = json.JSONDecoder()
    result = []
    while txt:
        data, pos = decoder.raw_decode(txt)
        result.append(data)
        txt = txt[pos:].lstrip()
    return result

print(json_record_reader("data.json"))
Considering the size of your file, a memory mapped text file may be the better option.
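mmap is one way to avoid a single 3 GB string; a related sketch below instead reads the file in fixed-size chunks and buffers only the not-yet-decoded tail, which keeps memory roughly constant. The chunk size and the iter_json_records name are arbitrary choices of mine, not part of the original answer:

import json

def iter_json_records(filename, chunk_size=1 << 20, encoding="utf-8"):
    # Read fixed-size chunks (chunk_size is an arbitrary choice), keep a buffer
    # of not-yet-decoded text, and pull complete objects out with raw_decode.
    decoder = json.JSONDecoder()
    buf = ""
    with open(filename, encoding=encoding) as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            buf += chunk
            while True:
                buf = buf.lstrip()
                if not buf:
                    break
                try:
                    record, end = decoder.raw_decode(buf)
                except json.JSONDecodeError:
                    break  # record continues in the next chunk; read more
                yield record
                buf = buf[end:]
    if buf.strip():
        # Anything left over at EOF should itself be a complete record.
        yield decoder.raw_decode(buf.strip())[0]

Each yielded record can then be written to the output file, writing the separating comma before every record except the first, so the result is one valid JSON array.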
If you're sure that the only place you will find a blank line is between two dicts, then you can go ahead with your current idea, after you fix its execution. For every line, check if it's empty. If it isn't, write it as-is. If it is, write a comma instead.
with open('dataframe.txt', 'r') as input_file, open('out.txt', 'w') as output_file:
    output_file.write("[")
    for line in input_file:
        if line.strip():
            output_file.write(line)
        else:
            output_file.write(",")
    output_file.write("]")
If you cannot guarantee that any blank line must be replaced by a comma, you need a different approach.
You want to replace a closing brace, followed by an empty line (or any amount of whitespace), followed by an opening brace, with },{.
You can keep track of the previous two lines in addition to the current line, and if these are "}", "", and "{" in that order, then write a comma before writing the "{".
from collections import deque

with open('dataframe.txt', 'r') as input_file, open('out.txt', 'w') as output_file:
    last_two_lines = deque(maxlen=2)
    output_file.write("[")
    for line in input_file:
        line_s = line.strip()
        # insert a comma only when the previous lines were "}" and ""
        if line_s == "{" and list(last_two_lines) == ["}", ""]:
            output_file.write("," + line)
        else:
            output_file.write(line)
        last_two_lines.append(line_s)
    output_file.write("]")
Alternatively, if you want to stick with regex, then you could do
import re

with open('dataframe.txt') as input_file:
    file_contents = input_file.read()

repl_contents = re.sub(r'\}(\s+)\{', r'},\1{', file_contents)

with open('out.txt', 'w') as output_file:
    output_file.write(repl_contents)
Here, the regex r"\}(\s+)\{" matches the pattern we're looking for (\s+ matches one or more whitespace characters and captures them in group 1, which we then use in the replacement string as \1).
Note that you will need to read and run re.sub on the entire file, which will be slow.

How can I delete an item from a json using the below method?

I need to remove data from a JSON file; at the minute I am using the following code:
import json

with open('E:/file/timings.json', 'r+') as f:
    qe = json.load(f)
    for item in qe['times']:
        if item['Proc'] == 'APS':
            print(f'{item["Num"]}')
            del item
    json.dump(qe, f, indent=4, sort_keys=False, ensure_ascii=False)
This doesn't delete anything from the JSON. Here is a small example of my JSON file:
{
    "times": [
        {
            "Num": "12345678901234567",
            "Start_Time": "2016-12-14 15:54:35",
            "Proc": "UPD",
        },
        {
            "Num": "12345678901234567",
            "Start_Time": "2016-12-08 15:34:05",
            "Proc": "APS",
        },
        {
            "Num": "12345678901234567",
            "Start_Time": "2016-11-30 11:20:21",
            "Proc": "Dev,
I would like it to look like this:
{
    "times": [
        {
            "Num": "12345678901234567",
            "Start_Time": "2016-12-14 15:54:35",
            "Proc": "UPD",
        },
        {
            "Num": "12345678901234567",
            "Start_Time": "2016-11-30 11:20:21",
            "Proc": "Dev,
As you can see, the portion with APS as the process has been removed.
You could load your initial JSON, create a new one (here new_json) that doesn't contain the items whose 'Proc' is equal to 'APS', and then overwrite your JSON file with that new_json.
import json

content = json.loads(open('timings.json', 'r').read())
new_json = {'times': []}

for item in content['times']:
    if item['Proc'] != 'APS':
        new_json['times'].append(item)

file = open('timings.json', 'w')
file.write(json.dumps(new_json, indent=4, sort_keys=False, ensure_ascii=False))
file.close()
It is not good practice to delete an element while iterating over the list.
Use:
import json

with open('E:/file/timings.json', 'r') as f:
    qe = json.load(f)

# Delete the required elements, keeping the surrounding structure.
qe['times'] = [item for item in qe['times'] if item['Proc'] != 'APS']

with open('E:/file/timings.json', 'w') as f:
    json.dump(qe, f, indent=4, sort_keys=False, ensure_ascii=False)
del, as you're using it, removes the name item from your session but leaves the actual item untouched in the data structure. You need to explicitly remove whatever item is pointing to from your data structure. Also, you want to avoid deleting items from a list while you are iterating over said list. You should recreate your entire list:
qe['times'] = [item for item in qe['times'] if item['Proc'] != 'APS']
You can use a helper function if you need to print:

def keep_item(thing):
    if thing['Proc'] == 'APS':
        print(thing['Num'])
        return False
    else:
        return True

qe['times'] = [item for item in qe['times'] if keep_item(item)]
You can use the method below to remove the elements from the list:

# Iterate over a reversed snapshot so pop() doesn't skip elements.
for i, item in reversed(list(enumerate(qe['times']))):
    if item['Proc'] == 'APS':
        qe['times'].pop(i)
and then write back to the JSON file.

How to produce custom JSON output from Scrapy?

I am working on a Scrapy script which should produce output like:
{
    "state": "FL",
    "date": "2017-11-03T14:52:26.007Z",
    "games": [
        {
            "name": "Game1"
        },
        {
            "name": "Game2"
        }
    ]
}
But for me it produces the output below when I run scrapy crawl items -o data.json -t json, with the repetition of state:
[
    {"state": "CA", "games": [], "crawlDate": "2014-10-04"},
    {"state": "CA", "games": [], "crawlDate": "2014-10-04"},
]
The code is given below:
items.py:

import scrapy

class Item(scrapy.Item):
    state = scrapy.Field()
    games = scrapy.Field()

In the spider file, the item is populated like this:

item = Item()
item['state'] = state
item['Date'] = '2014-10-04'
item['games'] = games
I know this is not the complete code, but it should give an idea of what I am after.
Ref. https://stackoverflow.com/a/43698923/8964297
You could try to write your own pipeline like this:
Put this into your pipelines.py file:
import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('scraped_items.json', 'w')
        # Your scraped items will be saved in the file 'scraped_items.json'.
        # You can change the filename to whatever you want.
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(
            dict(item),
            indent=4,
            sort_keys=True,
            separators=(',', ': ')
        ) + ",\n"
        self.file.write(line)
        return item
Then modify your settings.py to include the following:
ITEM_PIPELINES = {
    'YourSpiderName.pipelines.JsonWriterPipeline': 300,
}
Change YourSpiderName to the correct name of your spider.
Note that the file gets written directly by the pipeline, so you don't have to specify file and format with the -o and -t command line parameters.
Hope this gets you closer to what you need.
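If you specifically need the nested shape from the question (one object per state with a games list), here is a rough sketch of a pipeline that aggregates items before writing. The grouping logic is my assumption, not part of the original answer, and it presumes each item carries state, Date and games fields as in the spider snippet above:

import json

class NestedJsonPipeline(object):

    def open_spider(self, spider):
        self.states = {}

    def process_item(self, item, spider):
        # Sketch: assumes items expose 'state', 'Date' and 'games' fields.
        # Group every scraped item's games under its state.
        entry = self.states.setdefault(item['state'], {
            'state': item['state'],
            'date': item.get('Date'),
            'games': [],
        })
        entry['games'].extend(item.get('games') or [])
        return item

    def close_spider(self, spider):
        with open('scraped_items.json', 'w') as f:
            json.dump(list(self.states.values()), f, indent=4)

Register it in ITEM_PIPELINES the same way as the pipeline above.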

Read csv starting with leading spaces

I have a CSV-style file (from a third party, semicolon-delimited) in which each line starts and ends with a space, the fields are quoted with a double quote, and the file ends with a line containing only a space.
"first_name";"last_name"
"John";"Doe"
"Anita";"Doe"
I try to read this with the following code.
import csv
import json

def read_csv(filename):
    result = []
    with open(filename, 'r', encoding='utf-8') as f:
        csv_reader = csv.reader(f, delimiter=';', quotechar='"')
        for line_index, line in enumerate(csv_reader):
            if line_index == 0:
                header = line
                continue
            result.append(dict(zip(header, line)))
    return result

if __name__ == '__main__':
    contents = read_csv('test.txt')
    print(json.dumps(contents, indent=4, sort_keys=4))
This is my expected result:
[
    {
        "first_name": "John",
        "last_name ": "Doe "
    },
    {
        "first_name": "Anita",
        "last_name ": "Doe "
    }
]
However, it always takes the double quotes as part of the first column (due to the leading spaces), and it also takes the last, space-only line into account. This is the result I get:
[
    {
        " \"first_name\"": " \"John\"",
        "last_name ": "Doe "
    },
    {
        " \"first_name\"": " \"Anita\"",
        "last_name ": "Doe "
    },
    {
        " \"first_name\"": " "
    }
]
How can I get rid of these leading and trailing spaces before the csv is parsed? The answer here shows how to remove spaces from fields after they are read, but that wouldn't be good here, since it's not the contents of the fields that I want to change, but the fields themselves.
By the way: I am using Python 3.5.
EDIT
I am skipping empty lines now using the following code:
# Skip empty lines
line = [column.strip() for column in line]
if not any(line):
    continue
You can use skipinitialspace=True and a csv.DictReader (which assumes the first row is a header and creates a name->value dict for you instead of you doing it manually), e.g.:
with open(filename) as fin:
    csvin = csv.DictReader(fin, delimiter=';', skipinitialspace=True)
    result = list(csvin)
Alternatively, if only rows with some value should be considered (ie, the last row with no values, or even iterim blanks row should be filtered out), you can use:
result = [row for row in csvin if any(row.values())]
Which'll give you:
[{'first_name': 'John', 'last_name ': 'Doe '},
{'first_name': 'Anita', 'last_name ': 'Doe '}]
And the result of that using json.dumps(result, indent=4, sort_keys=4) is:
[
    {
        "first_name": "John",
        "last_name ": "Doe "
    },
    {
        "first_name": "Anita",
        "last_name ": "Doe "
    }
]
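If the space at the very start of each line still ends up inside the first field (skipinitialspace only skips spaces that follow a delimiter), one workaround is to strip each raw line before the csv module sees it. The generator expression below is my addition, not part of the original answer, and it assumes no quoted field spans multiple lines:

import csv

with open('test.txt', encoding='utf-8') as fin:
    # Strip the leading/trailing space from every physical line and drop the
    # space-only last line before csv parses anything.
    # (Assumption: no quoted field contains an embedded newline.)
    cleaned = (line.strip() for line in fin if line.strip())
    csvin = csv.DictReader(cleaned, delimiter=';', skipinitialspace=True)
    result = list(csvin)

Stripping the raw line only removes whitespace outside the quotes, so the trailing spaces inside values like "Doe " are preserved.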

Parsing file into Parent/ Child format for a JSON file

I would like some help/advice on how to parse this Gene Ontology (.obo) file.
I am working to create a visualisation in D3, and need to create a "tree" file in the JSON format:
{
    "name": "flare",
    "description": "flare",
    "children": [
        {
            "name": "analytic",
            "description": "analytics",
            "children": [
                {
                    "name": "cluster",
                    "description": "cluster",
                    "children": [
                        {"name": "Agglomer", "description": "AgglomerativeCluster", "size": 3938},
                        {"name": "Communit", "description": "CommunityStructure", "size": 3812},
                        {"name": "Hierarch", "description": "HierarchicalCluster", "size": 6714},
                        {"name": "MergeEdg", "description": "MergeEdge", "size": 743}
                    ]
                }, etc..
This format seems fairly easy to replicate in a dictionary in python, with 3 fields for each entry: name, description, and children[].
My problem here is actually HOW to extract the data. The file linked above has "objects" structured as:
[Term]
id: GO:0000001
name: mitochondrion inheritance
namespace: biological_process
def: "The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton." [GOC:mcc, PMID:10873824, PMID:11389764]
synonym: "mitochondrial inheritance" EXACT []
is_a: GO:0048308 ! organelle inheritance
is_a: GO:0048311 ! mitochondrion distribution
Where I will need the id, is_a and name fields. I have tried using Python to parse this, but I can't seem to find a way to locate each object.
Any ideas?
Here's a fairly simple way to parse the objects in your '.obo' file. It saves the object data into a dict with the id as the key and the name and is_a data saved in a list. Then it pretty-prints it using the standard json module's .dumps function.
For testing purposes, I used a truncated version of the file in your link that only includes up to id: GO:0000006.
This code ignores any objects that contain the is_obsolete field. It also removes the description info from the is_a fields; I figured you probably wanted that, but it's easy enough to disable that functionality.
#!/usr/bin/env python
''' Parse object data from a .obo file

    From http://stackoverflow.com/q/32989776/4014959

    Written by PM 2Ring 2015.10.07
'''

from __future__ import print_function, division
import json
from collections import defaultdict

fname = "go-basic.obo"
term_head = "[Term]"

#Keep the desired object data here
all_objects = {}

def add_object(d):
    #print(json.dumps(d, indent = 4) + '\n')

    #Ignore obsolete objects
    if "is_obsolete" in d:
        return

    #Gather desired data into a single list,
    # and store it in the main all_objects dict
    key = d["id"][0]
    is_a = d["is_a"]
    #Remove the next line if you want to keep the is_a description info
    is_a = [s.partition(' ! ')[0] for s in is_a]
    all_objects[key] = d["name"] + is_a

#A temporary dict to hold object data
current = defaultdict(list)

with open(fname) as f:
    #Skip header data
    for line in f:
        if line.rstrip() == term_head:
            break

    for line in f:
        line = line.rstrip()
        if not line:
            #ignore blank lines
            continue
        if line == term_head:
            #end of term
            add_object(current)
            current = defaultdict(list)
        else:
            #accumulate object data into current
            key, _, val = line.partition(": ")
            current[key].append(val)

#Add the final object
if current:
    add_object(current)

print("\nall_objects =")
print(json.dumps(all_objects, indent = 4, sort_keys=True))
output
all_objects =
{
    "GO:0000001": [
        "mitochondrion inheritance",
        "GO:0048308",
        "GO:0048311"
    ],
    "GO:0000002": [
        "mitochondrial genome maintenance",
        "GO:0007005"
    ],
    "GO:0000003": [
        "reproduction",
        "GO:0008150"
    ],
    "GO:0000006": [
        "high-affinity zinc uptake transmembrane transporter activity",
        "GO:0005385"
    ]
}
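To go from that flat dict to the D3 "name / description / children" tree the question asks for, here is a rough sketch of one way to invert the is_a links. This part is my addition (not from the original answer) and assumes the all_objects layout built above; note that GO is a DAG rather than a tree, so a term with several parents will appear under each of them.

from collections import defaultdict
import json

def build_children_map(all_objects):
    # Invert the is_a links: parent GO id -> list of child GO ids.
    # Assumption: each all_objects value is [name, parent_id, parent_id, ...].
    children = defaultdict(list)
    for term_id, data in all_objects.items():
        for parent in data[1:]:
            children[parent].append(term_id)
    return children

def to_tree(term_id, all_objects, children):
    # Build the {"name", "description", "children"} shape used by flare;
    # terms missing from all_objects fall back to their id as description.
    description = all_objects.get(term_id, [term_id])[0]
    node = {"name": term_id, "description": description}
    kids = [to_tree(child, all_objects, children) for child in children.get(term_id, [])]
    if kids:
        node["children"] = kids
    return node

children = build_children_map(all_objects)
print(json.dumps(to_tree("GO:0008150", all_objects, children), indent=4))

Here GO:0008150 (a parent of GO:0000003 in the output above) is used as the root; with the full file parsed you would pick whichever root term suits your visualisation.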
