I have a bunch of (modified) RIS files. The toy example looks like the following:
Record #1 of 2
ID: CN-01160769
AU: Uedo N
AU: Kasiser R
TI: Development of an E-learning system
SO: United European Gastroenterology Journal
YR: 2015
Record #2 of 2
ID: CN-01070265
AU: Krogh LQ
TI: E-learning in pediatric basic life support
SO: Resuscitation
YR: 2015
In brief, each record starts with a Record # line and ends with two blank lines. The task is to parse the file and extract the tags and fields.
Pasted below is my current code (adapted from here):
import re
class RIS:
""" RIS file structure """
def __init__(self, in_file=None):
""" Initialize and parse input """
self.records = []
if in_file:
self.parse(in_file)
def parse(self, in_file):
""" Parse input file """
self.current_tag = None
self.current_record = None
prog = re.compile("^([A-Z][A-Z0-9]): (.*)")
lines = []
# Eliminate blank lines
for line in in_file:
line = line.strip()
if len(line) > 0:
lines.append(line)
for line in lines:
match = prog.match(line)
if match:
tag = match.groups()[0]
field = match.groups()[1]
self.process_field(tag, field)
else:
raise ValueError(line)
def process_field(self, tag, field):
""" Process RIS file field """
if tag == "ID":
self.current_record = {tag: field}
elif tag == "YR":
self.records.append(self.current_record)
self.current_record = None
elif tag in ["AU", "AD"]:
if tag in self.current_record:
self.current_record[tag].append(field)
else:
self.current_record[tag] = [field]
else:
            if tag not in self.current_record:
self.current_record[tag] = field
else:
error_str = "Duplicate tag: %s" % tag
raise ValueError(error_str)
def main():
""" Test the code """
import pprint
with open("test.ris", "rt") as ris_file:
ris = RIS(ris_file)
pp = pprint.PrettyPrinter()
pp.pprint(ris.records)
if __name__ == "__main__":
main()
The current code doesn't work because it doesn't recognize the record header (e.g., Record #1 of 2), and in addition it doesn't know where a record stops. In the current version of the code I use ID as a start tag and YR as a stop tag. However, the code exits with the error:
ValueError: Record #1 of 2
Any suggestions on how to properly adapt the code are greatly welcome.
You just need to add a check that skips the Record #x of 2 lines.
import re
class RIS:
""" RIS file structure """
def __init__(self, in_file=None):
""" Initialize and parse input """
self.records = []
if in_file:
self.parse(in_file)
def parse(self, in_file):
""" Parse input file """
self.current_tag = None
self.current_record = None
prog = re.compile("^([A-Z][A-Z0-9]): (.*)")
lines = []
# Eliminate blank lines
for line in in_file:
line = line.strip()
if len(line) > 0:
lines.append(line)
for line in lines:
if "#" in line:
continue
match = prog.match(line)
if match:
tag = match.groups()[0]
field = match.groups()[1]
self.process_field(tag, field)
else:
raise ValueError(line)
def process_field(self, tag, field):
""" Process RIS file field """
if tag == "ID":
self.current_record = {tag: field}
elif tag == "YR":
self.records.append(self.current_record)
self.current_record = None
elif tag in ["AU", "AD"]:
if tag in self.current_record:
self.current_record[tag].append(field)
else:
self.current_record[tag] = [field]
else:
            if tag not in self.current_record:
self.current_record[tag] = field
else:
error_str = "Duplicate tag: %s" % tag
raise ValueError(error_str)
def main():
""" Test the code """
import pprint
with open("test.ris", "rt") as ris_file:
ris = RIS(ris_file)
pp = pprint.PrettyPrinter()
pp.pprint(ris.records)
if __name__ == "__main__":
main()
The added code:
if "#" in line:
continue
The output is:
[{'AU': ['Uedo N', 'Kasiser R'],
'ID': 'CN-01160769',
'SO': 'United European Gastroenterology Journal',
'TI': 'Development of an E-learning system'},
{'AU': ['Krogh LQ'],
'ID': 'CN-01070265',
'SO': 'Resuscitation',
'TI': 'E-learning in pediatric basic life support'}]
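Two remarks on the fix. First, the test "#" in line skips any line that contains a # character anywhere, not just the separator lines; a stricter variant (a sketch, not needed for the toy data above) matches the record header explicitly:

import re

RECORD_HEADER = re.compile(r"^Record #\d+ of \d+$")

def is_record_header(line):
    """ True for separator lines such as 'Record #1 of 2' """
    return RECORD_HEADER.match(line) is not None

With that helper, the skip inside parse() becomes: if is_record_header(line): continue, which cannot accidentally drop a title or author field that happens to contain a #. Second, note that because YR is used as the stop tag, the year itself is discarded; if you need it, store it in self.current_record before appending the record in process_field().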
I am mostly trying to create software that reads out the definition of every word you type into the text box. Right now it only works if there is one word, and it crashes if there is more than one. How would I go about fixing this?
import wolframalpha
client = wolframalpha.Client('8QR2WG-628657K83Q')
from multiprocessing import Process
import wikipedia
import PySimpleGUI as sg
import cv2
import random
import sys
import threading
import time
import nltk
nltk.download('punkt')
# from oxforddictionaries.words import OxfordDictionaries
# Oxford = OxfordDictionaries('b4170561','f32687e0ecbc219cfd723bb220dad34e')
# o = OxfordDictionaries('b4170561','f32687e0ecbc219cfd723bb220dad34e')
# relax = o.get_synonyms("Apple").json()
# synonyms = relax
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
filtered_list = []
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
stemmer = PorterStemmer()
trained_face_data = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
trained_body_data = cv2.CascadeClassifier('haarcascade_upperbody.xml')
trained_eye_data = cv2.CascadeClassifier('haarcascade_eye.xml')
webcam = cv2.VideoCapture(0)
sg.theme('graygraygray')
layout = [ [sg.Text("Enter Test Text")],
[sg.Input()],
[sg.Button('Ok')] ]
window = sg.Window('You', layout)
sg.Popup('About Me','Hello I am an AI developed by Garrett Provence. I will be using your webcam to scan your surroundings for a quick few seconds and will open a text box where you will be able to ask me questions. By clicking ok below you agree to letting me access everything said before. I am still in beta so please be patient.')
timeout = time.time() + 10;
while True:
##Webcam scanner
def infiniteloop1():
while True:
test = 0
if test == 5 or time.time() > timeout:
break
test = test - 1
successful_frame_read, frame = webcam.read()
grayscaled_img = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
face_coordinates = trained_face_data.detectMultiScale(grayscaled_img)
body_coordinates = trained_body_data.detectMultiScale(grayscaled_img)
eye_coordinates = trained_eye_data.detectMultiScale(grayscaled_img)
for (x,y,w,h) in face_coordinates:
cv2.rectangle(frame, (x, y),(x+w, y+h), (0,random.randrange(255),0), 2)
for (x,y,w,h) in body_coordinates:
cv2.rectangle(frame, (x, y),(x+w, y+h), (0,0,255), 2)
for (x,y,w,h) in eye_coordinates:
cv2.rectangle(frame, (x, y),(x+w, y+h), (random.randrange(255),0,0), 2)
cv2.imshow('FaceThing',frame)
cv2.waitKey(1)
thread1 = threading.Thread(target=infiniteloop1)
thread1.start()
event, values = window.read()
InputText = values[0]
import json
import requests
import os
import pprint
import Oxfordwords
from Oxfordwords import Word
import pprint
##end OF webcam scanner
#img = cv2.imread('Rdj.png')
while True:
##Test Text Scanner --
Text = values[0]
Word.get(Text)
if event == sg.WIN_CLOSED or event == 'Cancel':
break
sys.exit()
try:
words_in_excerpt = word_tokenize(Text)
nltk.pos_tag(words_in_excerpt)
print('Hello',nltk.pos_tag(words_in_excerpt), "")
sg.Popup('Test', nltk.pos_tag(words_in_excerpt))
sg.Popup('Def',Word.definitions())
break
except:
            sg.Popup('There seems to be an error processing what you have said')
break
##End of test Text Scanner --
The Oxford dictionary code:
#!/bin/env python3
""" oxford dictionary api """
from http import cookiejar
import requests
from bs4 import BeautifulSoup as soup
class WordNotFound(Exception):
""" word not found in dictionary (404 status code) """
pass
class BlockAll(cookiejar.CookiePolicy):
""" policy to block cookies """
return_ok = set_ok = domain_return_ok = path_return_ok = lambda self, *args, **kwargs: False
netscape = True
rfc2965 = hide_cookie2 = False
class Word(object):
""" retrive word info from oxford dictionary website """
entry_selector = '#entryContent > .entry'
header_selector = '.top-container'
title_selector = header_selector + ' .headword'
wordform_selector = header_selector + ' .pos'
property_global_selector = header_selector + ' .grammar'
br_pronounce_selector = '[geo=br] .phon'
am_pronounce_selector = '[geo=n_am] .phon'
br_pronounce_audio_selector = '[geo=br] [data-src-ogg]'
am_pronounce_audio_selector = '[geo=n_am] [data-src-ogg]'
definition_body_selector = '.senses_multiple'
namespaces_selector = '.senses_multiple > .shcut-g'
examples_selector = '.senses_multiple .sense > .examples .x'
definitions_selector = '.senses_multiple .sense > .def'
extra_examples_selector = '.res-g [title="Extra examples"] .x-gs .x'
phrasal_verbs_selector = '.phrasal_verb_links a'
idioms_selector = '.idioms > .idm-g'
other_results_selector = '#rightcolumn #relatedentries'
soup_data = None
    @classmethod
def get_url(cls, word):
""" get url of word definition """
baseurl = 'https://www.oxfordlearnersdictionaries.com/definition/english/'
return baseurl + word
    @classmethod
def delete(cls, selector):
""" remove tag with specified selector in cls.soup_data """
try:
for tag in cls.soup_data.select(selector):
tag.decompose()
except IndexError:
pass
    @classmethod
def get(cls, word):
""" get html soup of word """
req = requests.Session()
req.cookies.set_policy(BlockAll())
page_html = req.get(cls.get_url(word), timeout=5, headers={'User-agent': 'mother animal'})
if page_html.status_code == 404:
raise WordNotFound
else:
cls.soup_data = soup(page_html.content, 'html.parser')
if cls.soup_data is not None:
# remove some unnecessary tags to prevent false positive results
cls.delete('[title="Oxford Collocations Dictionary"]')
cls.delete('[title="British/American"]') # edge case: 'phone'
cls.delete('[title="Express Yourself"]')
cls.delete('[title="Collocations"]')
cls.delete('[title="Word Origin"]')
    @classmethod
def other_results(cls):
""" get similar words, idioms, phrases...
Return: {
'All matches': [
{'word1': word1, 'id1': id1, 'wordform1': wordform1},
{'word2': word2, 'id2': id2, 'wordform2': wordform2}
...
]
'Phrasal verbs': [
{'word1': word1, 'id1': id1, 'wordform1': wordform1},
{'word2': word2, 'id2': id2, 'wordform2': wordform2}
...
]
...
}
"""
info = []
try:
rightcolumn_tags = cls.soup_data.select(cls.other_results_selector)[0]
except IndexError:
return None
# there can be multiple other results table like All matches, Phrasal verbs, Idioms,...
header_tags = rightcolumn_tags.select('dt')
other_results_tags = rightcolumn_tags.select('dd')
# loop each other result table
for header_tag, other_results_tag in zip(header_tags, other_results_tags):
header = header_tag.text
other_results = []
for item_tag in other_results_tag.select('li'):
names = item_tag.select('span')[0].find_all(text=True, recursive=False)
wordform_tag = item_tag.select('pos')
names.append(wordform_tag[0].text if len(wordform_tag) > 0 else '')
other_results.append(names)
other_results = list(filter(None, other_results)) # remove empty list
ids = [cls.extract_id(tag.attrs['href'])
for tag in other_results_tag.select('li a')]
results = []
for other_result, id in zip(other_results, ids):
result = {}
result['name'] = ' '.join(list(map(lambda x: x.strip(), other_result[0:-1])))
result['id'] = id
try:
result['wordform'] = other_result[-1].strip()
except IndexError:
pass
results.append(result)
info.append({header: results})
return info
    @classmethod
def name(cls):
""" get word name """
if cls.soup_data is None:
return None
return cls.soup_data.select(cls.title_selector)[0].text
    @classmethod
    def id(cls):
        """ get id of a word. if a word has definitions in 2 separate pages
        (multiple wordform) it will return 'word_1' and 'word_2' depending on
        which page it's on """
if cls.soup_data is None:
return None
return cls.soup_data.select(cls.entry_selector)[0].attrs['id']
    @classmethod
def wordform(cls):
""" return wordform of word (verb, noun, adj...) """
if cls.soup_data is None:
return None
try:
return cls.soup_data.select(cls.wordform_selector)[0].text
except IndexError:
return None
    @classmethod
def property_global(cls):
""" return global property (apply to all definitions) """
if cls.soup_data is None:
return None
try:
return cls.soup_data.select(cls.property_global_selector)[0].text
except IndexError:
return None
    @classmethod
def get_prefix_from_filename(cls, filename):
""" get prefix (NAmE or BrE) from audio name when prefix is null """
if '_gb_' in filename:
return 'BrE'
elif '_us_' in filename:
return 'NAmE'
return None
    @classmethod
def pronunciations(cls):
""" get britain and america pronunciations """
if cls.soup_data is None:
return None
britain = {'prefix': None, 'ipa': None, 'url': None}
america = {'prefix': None, 'ipa': None, 'url': None}
try:
britain_pron_tag = cls.soup_data.select(cls.br_pronounce_selector)[0]
america_pron_tag = cls.soup_data.select(cls.am_pronounce_selector)[0]
britain['ipa'] = britain_pron_tag.text
britain['prefix'] = 'BrE'
america['ipa'] = america_pron_tag.text
            america['prefix'] = 'NAmE'
except IndexError:
pass
try:
britain['url'] = cls.soup_data.select(cls.br_pronounce_audio_selector)[0].attrs['data-src-ogg']
america['url'] = cls.soup_data.select(cls.am_pronounce_audio_selector)[0].attrs['data-src-ogg']
except IndexError:
pass
        if britain['prefix'] is None and britain['url'] is not None:
            britain['prefix'] = cls.get_prefix_from_filename(britain['url'])
        if america['prefix'] is None and america['url'] is not None:
            america['prefix'] = cls.get_prefix_from_filename(america['url'])
return [britain, america]
    @classmethod
def extract_id(cls, link):
""" get word id from link
Argument: https://abc/definition/id
Return: id
"""
return link.split('/')[-1]
    @classmethod
def get_references(cls, tags):
""" get info about references to other page
Argument: soup.select(<selector>)
Return: [{'id': <id>, 'name': <word>}, {'id': <id2>, 'name': <word2>}, ...]
"""
if cls.soup_data is None:
return None
references = []
for tag in tags.select('.xrefs a'): # see also <external link>
id = cls.extract_id(tag.attrs['href'])
word = tag.text
references.append({'id': id, 'name': word})
return references
    @classmethod
def references(cls):
""" get global references """
if cls.soup_data is None:
return None
header_tag = cls.soup_data.select(cls.header_selector)[0]
return cls.get_references(header_tag)
    @classmethod
def definitions(cls, full=False):
""" Return: list of definitions """
if cls.soup_data is None:
return None
if not full:
return [tag.text for tag in cls.soup_data.select(cls.definitions_selector)]
return cls.definition_full()
    @classmethod
    def examples(cls):
        """ List of all examples (not categorized into separate definitions) """
if cls.soup_data is None:
return None
return [tag.text for tag in cls.soup_data.select(cls.examples_selector)]
    @classmethod
def phrasal_verbs(cls):
""" get phrasal verbs list (verb only) """
if cls.soup_data is None:
return None
phrasal_verbs = []
for tag in cls.soup_data.select(cls.phrasal_verbs_selector):
phrasal_verb = tag.select('.xh')[0].text
id = cls.extract_id(tag.attrs['href']) # https://abc/definition/id -> id
phrasal_verbs.append({'name': phrasal_verb, 'id': id})
return phrasal_verbs
    @classmethod
    def _parse_definition(cls, parent_tag):
        """ return word definition + corresponding examples
        A word can have a single (None) or multiple namespaces
        Each namespace can have one or many definitions
        Each definition can have one, many or no examples
        Some words can have specific properties
        (transitive/intransitive/countable/uncountable/singular/plural...)
        A verb can have phrasal verbs
        """
if cls.soup_data is None:
return None
definition = {}
try: # property (countable, transitive, plural,...)
definition['property'] = parent_tag.select('.grammar')[0].text
except IndexError:
pass
try: # label: (old-fashioned), (informal), (saying)...
definition['label'] = parent_tag.select('.labels')[0].text
except IndexError:
pass
try: # refer to something (of people, of thing,...)
definition['refer'] = parent_tag.select('.dis-g')[0].text
except IndexError:
pass
definition['references'] = cls.get_references(parent_tag)
if not definition['references']:
definition.pop('references', None)
try: # sometimes, it just refers to other page without having a definition
definition['description'] = parent_tag.select('.def')[0].text
except IndexError:
pass
definition['examples'] = [example_tag.text
for example_tag in parent_tag.select('.examples .x')]
definition['extra_example'] = [
example_tag.text
for example_tag in parent_tag.select('[unbox=extra_examples] .examples .unx')
]
return definition
    @classmethod
    def definition_full(cls):
        """ return word definition + corresponding examples
        A word can have a single (None) or multiple namespaces
        Each namespace can have one or many definitions
        Each definition can have one, many or no examples
        Some words can have specific properties
        (transitive/intransitive/countable/uncountable/singular/plural...)
        A verb can have phrasal verbs
        """
if cls.soup_data is None:
return None
namespace_tags = cls.soup_data.select(cls.namespaces_selector)
info = []
for namespace_tag in namespace_tags:
try:
namespace = namespace_tag.select('h2.shcut')[0].text
except IndexError:
# some word have similar definitions grouped in a multiple namespaces (time)
# some do not, and only have one namespace (woman)
namespace = None
definitions = []
definition_full_tags = namespace_tag.select('.sense')
for definition_full_tag in definition_full_tags:
definition = cls._parse_definition(definition_full_tag)
definitions.append(definition)
info.append({'namespace': namespace, 'definitions': definitions})
# no namespace. all definitions is global
if len(info) == 0:
info.append({'namespace': '__GLOBAL__', 'definitions': []})
def_body_tags = cls.soup_data.select(cls.definition_body_selector)
definitions = []
definition_full_tags = def_body_tags[0].select('.sense')
for definition_full_tag in definition_full_tags:
definition = cls._parse_definition(definition_full_tag)
definitions.append(definition)
info[0]['definitions'] = definitions
return info
    @classmethod
    def idioms(cls):
        """ get word idioms
        Idioms don't have a namespace like regular definitions
        Each idiom has one or more definitions
        Each definition can have one, many or no examples
        """
idiom_tags = cls.soup_data.select(cls.idioms_selector)
idioms = []
for idiom_tag in idiom_tags:
try:
# sometimes idiom is in multiple idm classes inside
# one idm-l class instead of a single idm class
idiom = idiom_tag.select('.idm-l')[0].text
except IndexError:
idiom = idiom_tag.select('.idm')[0].text
global_definition = {}
try: # label: (old-fashioned), (informal), (saying)...
global_definition['label'] = idiom_tag.select('.labels')[0].text
except IndexError:
pass
try: # refer to something (of people, of thing,...)
global_definition['refer'] = idiom_tag.select('.dis-g')[0].text
except IndexError:
pass
global_definition['references'] = cls.get_references(idiom_tag)
if not global_definition['references']:
global_definition.pop('references', None)
definitions = []
# one idiom can have multiple definitions, each can have multiple examples or no example
for definition_tag in idiom_tag.select('.sense'):
definition = {}
try: # sometimes, it just refers to other page without having a definition
definition['description'] = definition_tag.select('.def')[0].text
except IndexError:
pass
try: # label: (old-fashioned), (informal), (saying)...
definition['label'] = definition_tag.select('.labels')[0].text
except IndexError:
pass
try: # refer to something (of people, of thing,...)
definition['refer'] = definition_tag.select('.dis-g')[0].text
except IndexError:
pass
definition['references'] = cls.get_references(definition_tag)
if not definition['references']:
definition.pop('references', None)
definition['examples'] = [example_tag.text for example_tag in definition_tag.select('.x')]
definitions.append(definition)
idioms.append({'name': idiom, 'summary': global_definition, 'definitions': definitions})
return idioms
    @classmethod
def info(cls):
""" return all info about a word """
if cls.soup_data is None:
return None
word = {
'id': cls.id(),
'name': cls.name(),
'wordform': cls.wordform(),
'pronunciations': cls.pronunciations(),
'property': cls.property_global(),
'definitions': cls.definitions(full=True),
'idioms': cls.idioms(),
'other_results': cls.other_results()
}
if not word['property']:
word.pop('property', None)
if not word['other_results']:
word.pop('other_results', None)
if word['wordform'] == 'verb':
word['phrasal_verbs'] = cls.phrasal_verbs()
return word
Any help will be appreciated, thank you :)
Just split values[0] into words and call Word.get(...) on each:
import re
while True:
##Test Text Scanner --
words = re.findall(r"\w+", values[0].strip()) # can also use nltk.word_tokenize
for word in words:
Word.get(word)
if event == sg.WIN_CLOSED or event == 'Cancel':
break
try:
words_in_excerpt = word_tokenize(Text)
nltk.pos_tag(words_in_excerpt)
print('Hello', nltk.pos_tag(words_in_excerpt), "")
sg.Popup('Test', nltk.pos_tag(words_in_excerpt))
sg.Popup('Def', Word.definitions())
break
except:
        sg.Popup('There seems to be an error processing what you have said')
break
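One caveat, judging from the Oxford code pasted above (not something tested against your full program): Word keeps the fetched page in the class-level soup_data, so after the loop Word.definitions() only returns the definitions of the last word. A sketch that collects the definitions per word as it goes:

definitions = {}
for word in words:
    try:
        Word.get(word)  # replaces Word.soup_data with this word's page
        definitions[word] = Word.definitions()
    except WordNotFound:
        definitions[word] = None  # the dictionary returned a 404 for this word

sg.Popup('Def', definitions)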
I have to extract text from PDF pages into a CSV file, keeping the indentation as it is.
Index page from PDF text book:
I should split the text into a class and subclass hierarchy along with the page numbers. For example, in the image,
Application server is the class and Apache Tomcat is the subclass, on page 275.
This is the expected output of the CSV:
I have used the Tika parser to parse the PDF, but the indentation is not maintained (not unique) in the parsed content, so it cannot be used to split the text into classes and subclasses.
This is what the parsed text looks like:
Can anyone suggest the right approach for this requirement?
Although I have no knowledge of PDF extraction, it is possible to reconstruct the hierarchy from "the parsed text", because the "subclass" part always starts and ends with an extra newline character.
With the following test text:
app architect . 50
app logic . 357
app server . 275
tomcat . 275
websphere . 275
jboss . 164
architect
acceptance . 303
development path . 304
architecting . 48
architectural activity . 25, 320
and the following code:
import csv
import sys
import re
def gen():
is_subclass = False
p_class = None
with open('test.data') as f:
s = f.read()
lines = re.findall(r'[^\n]+\n+', s)
for line in lines:
if ' . ' in line:
class_name, page_no = map(lambda s: s.strip(), line.split('.'))
else:
class_name, page_no = line.strip(), ''
if line.endswith('\n\n'):
if not is_subclass:
p_class = class_name
is_subclass = True
continue
if is_subclass:
yield (p_class, class_name, page_no)
else:
yield (class_name, '', page_no)
if line.endswith('\n\n'):
is_subclass = False
writer = csv.writer(sys.stdout)
writer.writerows(gen())
yields:
app architect,,50
app logic,,357
app server,tomcat,275
app server,websphere,275
app server,jboss,164
architect,acceptance,303
architect,development path,304
architecting,,48
architectural activity,,"25, 320"
Hope this helps.
So here is the solution:
Install Fitz (PyMuPDF): https://github.com/rk700/PyMuPDF
Run the code below with Python 2.7, in the same folder as your PDF file
Compare the result
Code:
import fitz
import json
import re
import csv
class MyClass:
def __init__(self, text, main_class):
my_arr = re.split("[.]*", text)
if main_class != my_arr[0].strip():
main_class = my_arr[0].strip()
self.main_class = main_class
self.sub_class = my_arr[0].strip()
try:
self.page = my_arr[1].strip()
except:
self.page = ""
def add_line(text, is_recording, main_class):
if(is_recording):
obj = MyClass(text, main_class)
if obj.sub_class == "Glossary":
return False, main_class
table.append(obj)
return True, obj.main_class
elif text == "Contents":
return True, main_class
return False, main_class
last_text = ""
is_recording = False
main_class = ""
table = []
doc = fitz.open("TCS_1.pdf")
page = doc.getPageText(2, output="json")
blocks = json.loads(page)["blocks"]
for block in blocks:
if "lines" in block:
for line in block["lines"]:
line_text = ""
for span in block["lines"]:
line_text += span["spans"][0]["text"].encode("utf-8")
if last_text != line_text:
is_recording, main_class = add_line(line_text, is_recording, main_class)
last_text = line_text
writer = csv.writer(open("output.csv", 'w'), delimiter=',', lineterminator='\n')
for my_class in table:
writer.writerow([my_class.main_class, my_class.sub_class, my_class.page])
# print(my_class.main_class, my_class.sub_class, my_class.page)
Here is the CSV output of the file provided:
I've solved the problem. The problem was related to my %PATH%.
I have a script which works with an argument. In PowerShell I've tried the command you can see below:
.\dsrf2csv.py C:\Python27\a\DSR_testdata.tsv.gz
And you can also see the script below:
def __init__(self, dsrf2csv_arg):
self.dsrf_filename = dsrf2csv_arg
dsrf_path, filename = os.path.split(self.dsrf_filename)
self.report_outfilename = os.path.join(dsrf_path, filename.replace('DSR', 'Report').replace('tsv', 'csv'))
self.summary_outfilename = os.path.join(dsrf_path, filename.replace('DSR', 'Summary').replace('tsv.gz', 'csv'))
But when I try to run this script, nothing happens. How should I run this script with a file? (example: testdata.tsv.gz)
Note: the script and the file are in the same location.
Full script:
import argparse
import atexit
import collections
import csv
import gzip
import os
SKIP_ROWS = ['HEAD', '#HEAD', '#SY02', '#SY03', '#AS01', '#MW01', '#RU01',
'#SU03', '#LI01', '#FOOT']
REPORT_HEAD = ['Asset_ID', 'Asset_Title', 'Asset_Artist', 'Asset_ISRC',
'MW_Asset_ID', 'MW_Title', 'MW_ISWC', 'MW_Custom_ID',
'MW_Writers', 'Views', 'Owner_name', 'Ownership_Claim',
'Gross_Revenue', 'Amount_Payable', 'Video_IDs', 'Video_views']
SUMMARY_HEAD = ['SummaryRecordId', 'DistributionChannel',
'DistributionChannelDPID', 'CommercialModel', 'UseType',
'Territory', 'ServiceDescription', 'Usages', 'Users',
'Currency', 'NetRevenue', 'RightsController',
'RightsControllerPartyId', 'AllocatedUsages', 'AmountPayable',
'AllocatedNetRevenue']
class DsrfConverter(object):
"""Converts DSRF 3.0 to YouTube CSV."""
def __init__(self, dsrf2csv_arg):
""" Creating output file names """
self.dsrf_filename = dsrf2csv_arg
dsrf_path, filename = os.path.split(self.dsrf_filename)
        print(self.dsrf_filename)
input("Press Enter to continue...")
self.report_outfilename = os.path.join(dsrf_path, filename.replace(
'DSR', 'Report').replace('tsv', 'csv'))
self.summary_outfilename = os.path.join(dsrf_path, filename.replace(
'DSR', 'Summary').replace('tsv.gz', 'csv'))
def parse_blocks(self, reader):
"""Generator for parsing all the blocks from the file.
Args:
reader: the handler of the input file
Yields:
block_lines: A full block as a list of rows.
"""
block_lines = []
current_block = None
for line in reader:
if line[0] in SKIP_ROWS:
continue
# Exit condition
if line[0] == 'FOOT':
yield block_lines
                return  # PEP 479: raising StopIteration inside a generator is an error in Python 3.7+
line_block_number = int(line[1])
if current_block is None:
# Initialize
current_block = line_block_number
if line_block_number > current_block:
# End of block, yield and build a new one
yield block_lines
block_lines = []
current_block = line_block_number
block_lines.append(line)
# Also return last block
yield block_lines
def process_single_block(self, block):
"""Handles a single block in the DSR report.
Args:
block: Block as a list of lines.
Returns:
(summary_rows, report_row) tuple.
"""
views = 0
gross_revenue = 0
summary_rows = []
owners_data = {}
# Create an ordered dictionary with a key for every column.
report_row_dict = collections.OrderedDict(
[(column_name.lower(), '') for column_name in REPORT_HEAD])
for line in block:
if line[0] == 'SY02': # Save the financial Summary
summary_rows.append(line[1:])
continue
if line[0] == 'AS01': # Sound Recording information
report_row_dict['asset_id'] = line[3]
report_row_dict['asset_title'] = line[5]
report_row_dict['asset_artist'] = line[7]
report_row_dict['asset_isrc'] = line[4]
if line[0] == 'MW01': # Composition information
report_row_dict['mw_asset_id'] = line[2]
report_row_dict['mw_title'] = line[4]
report_row_dict['mw_iswc'] = line[3]
report_row_dict['mw_writers'] = line[6]
if line[0] == 'RU01': # Video level information
report_row_dict['video_ids'] = line[3]
report_row_dict['video_views'] = line[4]
if line[0] == 'SU03': # Usage data of Sound Recording Asset
# Summing up views and revenues for each sub-period
views += int(line[5])
gross_revenue += float(line[6])
report_row_dict['views'] = views
report_row_dict['gross_revenue'] = gross_revenue
if line[0] == 'LI01': # Ownership information
# if we already have parsed a LI01 line with that owner
if line[3] in owners_data:
# keep only the latest ownership
owners_data[line[3]]['ownership'] = line[6]
owners_data[line[3]]['amount_payable'] += float(line[9])
else:
# need to create the entry for that owner
data_dict = {'custom_id': line[5],
'ownership': line[6],
'amount_payable': float(line[9])}
owners_data[line[3]] = data_dict
# get rid of owners which do not have an ownership or an amount payable
owners_to_write = [o for o in owners_data
if (owners_data[o]['ownership'] > 0
and owners_data[o]['amount_payable'] > 0)]
report_row_dict['owner_name'] = '|'.join(owners_to_write)
report_row_dict['mw_custom_id'] = '|'.join([owners_data[o]
['custom_id']
for o in owners_to_write])
report_row_dict['ownership_claim'] = '|'.join([owners_data[o]
['ownership']
for o in owners_to_write])
report_row_dict['amount_payable'] = '|'.join([str(owners_data[o]
['amount_payable'])
for o in owners_to_write])
# Sanity check. The number of values must match the number of columns.
assert len(report_row_dict) == len(REPORT_HEAD), 'Row is wrong size :/'
return summary_rows, report_row_dict
def run(self):
finished = False
def removeFiles():
if not finished:
os.unlink(self.report_outfilename)
os.unlink(self.summary_outfilename)
atexit.register(removeFiles)
with gzip.open(self.dsrf_filename, 'rb') as dsr_file, gzip.open(
self.report_outfilename, 'wb') as report_file, open(
self.summary_outfilename, 'wb') as summary_file:
dsr_reader = csv.reader(dsr_file, delimiter='\t')
report_writer = csv.writer(report_file)
summary_writer = csv.writer(summary_file)
report_writer.writerow(REPORT_HEAD)
summary_writer.writerow(SUMMARY_HEAD)
for block in self.parse_blocks(dsr_reader):
summary_rows, report_row = self.process_single_block(block)
report_writer.writerow(report_row.values())
summary_writer.writerows(summary_rows)
finished = True
if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(
description='Converts DDEX DSRF UGC profile reports to Standard CSV.')
required_args = arg_parser.add_argument_group('Required arguments')
required_args.add_argument('dsrf2csv_arg', type=str)
args = arg_parser.parse_args()
dsrf_converter = DsrfConverter(args.dsrf2csv_arg)
dsrf_converter.run()
In general, executing a Python script in PowerShell like this: .\script.py has two requirements:
Add the path to the Python binaries to your %PATH%: $env:Path = $env:Path + ";C:\Path\to\python\binaries\"
Add the extension .py to the PATHEXT environment variable: $env:PATHEXT += ";.PY"
The latter will only be used in the current PowerShell session. If you want to add it to all future PowerShell sessions, add this line to your PowerShell profile (e.g. notepad $profile).
In your case there is also an issue with the Python script you are trying to execute. def __init__(self) is a constructor for a class, like:
class Foo:
def __init__(self):
print "foo"
Did you give us your complete script?
Hello, I have a network in a particular format, i.e. .gdf. This is a text file in the following format:
network:
nodedef>name VARCHAR,label VARCHAR
0,' 0 '
1,' 1 '
2,' 2 '
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333
where the first part refers to nodes and the second part to edges.
I want to read the file, add an attribute to the nodes, and return the following:
network:
nodedef>name VARCHAR,label VARCHAR, att1 VARCHAR
0,' 0 ', 'Paul'
1,' 1 ', 'Jack'
2,' 2 ', 'John'
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333
Here is some code that does the first half of what you asked for. It will parse the .GDF file and make the information available to you. Adding attributes and writing them is left as an exercise for the reader (a rough sketch follows the code).
import ast
import collections
import re
def main():
parser = GDFParser()
with open('network.gdf') as file:
parser.read(file)
print(*parser.data, sep='\n')
def pivot(iterable):
columns = []
for row in iterable:
columns.extend([] for _ in range(len(row) - len(columns)))
for column, cell in zip(columns, row):
column.append(cell)
return columns
class GDFParser:
    HEADER = re.compile(r'\w+:')
    DEF = re.compile(r'\w+>\w+ (?:DOUBLE|VARCHAR)(?:,\w+ (?:DOUBLE|VARCHAR))*')
CAST = dict(DOUBLE=float, VARCHAR=str)
def __init__(self):
self.__header = None
self.__type = []
self.__data = []
    @property
def header(self):
return self.__header
    @property
def data(self):
return tuple(self.__data)
def read(self, file):
for line in file:
self.__read_line(line.strip())
def __read_line(self, line):
if self.HEADER.fullmatch(line):
self.__process_header(line)
elif self.DEF.fullmatch(line):
self.__process_def(line)
else:
self.__process_data(line)
def __process_header(self, line):
if self.header:
raise ValueError('header was previously set')
self.__header = line[:-1]
def __process_def(self, line):
name, fields = line.split('>')
columns, casts = pivot(field.split() for field in fields.split(','))
self.__type.append((collections.namedtuple(name, columns),
tuple(map(self.CAST.__getitem__, casts))))
def __process_data(self, line):
if not self.__type:
raise ValueError('a definition must come before its data')
kind, casts = self.__type[-1]
self.__data.append(kind(*(cast(item) for cast, item in
zip(casts, ast.literal_eval(line)))))
if __name__ == '__main__':
main()
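To sketch the exercise half as well (this is an assumption about what you need, with att1 and the name values being placeholder examples): since a .gdf file is plain text, you can append the attribute to the nodedef line and to each node row directly, without the parser above.

def add_node_attribute(in_path, out_path, att_name, values):
    """ Append a VARCHAR attribute to the node section of a .gdf file.
    values maps node name -> attribute value. """
    in_nodes = False
    with open(in_path) as src, open(out_path, 'w') as dst:
        for line in src:
            line = line.rstrip('\n')
            if line.startswith('nodedef>'):
                in_nodes = True
                line += ', ' + att_name + ' VARCHAR'
            elif line.startswith('edgedef>'):
                in_nodes = False
            elif in_nodes and line:
                name = line.split(',')[0]
                line += ", '" + values.get(name, '') + "'"
            dst.write(line + '\n')

add_node_attribute('network.gdf', 'network_new.gdf', 'att1',
                   {'0': 'Paul', '1': 'Jack', '2': 'John'})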
I am trying to parse an XML file. The text inside tags is parsed successfully (or so it seems), but I also want to output the text that is not contained in any tags, and the following program just ignores it.
from xml.etree.ElementTree import XMLTreeBuilder
class HtmlLatex: # The target object of the parser
out = ''
var = ''
def start(self, tag, attrib): # Called for each opening tag.
pass
def end(self, tag): # Called for each closing tag.
if tag == 'i':
self.out += self.var
elif tag == 'sub':
self.out += '_{' + self.var + '}'
elif tag == 'sup':
self.out += '^{' + self.var + '}'
else:
self.out += self.var
def data(self, data):
self.var = data
def close(self):
print(self.out)
if __name__ == '__main__':
target = HtmlLatex()
parser = XMLTreeBuilder(target=target)
text = ''
with open('input.txt') as f1:
text = f1.read()
print(text)
parser.feed(text)
parser.close()
A part of the input I want to parse:
<p><i>p</i><sub>0</sub> = (<i>m</i><sup>3</sup>+(2<i>l</i><sub>2</sub>+<i>l</i><sub>1</sub>) <i>m</i><sup>2</sup>+(<i>l</i><sub>2</sub><sup>2</sup>+2<i>l</i><sub>1</sub> <i>l</i><sub>2</sub>+<i>l</i><sub>1</sub><sup>2</sup>) <i>m</i>) /(<i>m</i><sup>3</sup>+(3<i>l</i><sub>2</sub>+2<i>l</i><sub>1</sub>) ) }.</p>
Have a look at BeautifulSoup, a Python library for parsing, navigating and manipulating HTML and XML. It has a handy interface and might solve your problem.
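Alternatively, the reason your current program drops the text between tags is that in ElementTree's model that text lives in the .tail attribute of the preceding element, and the data() callback overwrites self.var each time it fires. A minimal sketch that walks the tree recursively and keeps both .text and .tail (assuming the input is wrapped in a single root element, as in your <p> example):

from xml.etree import ElementTree as ET

def to_latex(elem):
    """ Render an element and its children, keeping inter-tag text. """
    out = elem.text or ''
    for child in elem:
        inner = to_latex(child)
        if child.tag == 'sub':
            out += '_{' + inner + '}'
        elif child.tag == 'sup':
            out += '^{' + inner + '}'
        else:
            out += inner         # <i> and any other tag: keep the content as-is
        out += child.tail or ''  # the text that follows the child's closing tag
    return out

with open('input.txt') as f:
    print(to_latex(ET.fromstring(f.read())))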
Here's a pyparsing version - I hope the comments are sufficiently explanatory.
src = """<p><i>p</i><sub>0</sub> = (<i>m</i><sup>3</sup>+(2<i>l</i><sub>2</sub>+<i>l</i><sub>1</sub>) """ \
"""<i>m</i><sup>2</sup>+(<i>l</i><sub>2</sub><sup>2</sup>+2<i>l</i><sub>1</sub> <i>l</i><sub>2</sub>+""" \
"""<i>l</i><sub>1</sub><sup>2</sup>) <i>m</i>) /(<i>m</i><sup>3</sup>+(3<i>l</i><sub>2</sub>+""" \
"""2<i>l</i><sub>1</sub>) ) }.</p>"""
from pyparsing import makeHTMLTags, anyOpenTag, anyCloseTag, Suppress, replaceWith
# set up tag matching for <sub> and <sup> tags
SUB,endSUB = makeHTMLTags("sub")
SUP,endSUP = makeHTMLTags("sup")
# all other tags will be suppressed from the output
ANY,endANY = map(Suppress,(anyOpenTag,anyCloseTag))
SUB.setParseAction(replaceWith("_{"))
SUP.setParseAction(replaceWith("^{"))
endSUB.setParseAction(replaceWith("}"))
endSUP.setParseAction(replaceWith("}"))
transformer = (SUB | endSUB | SUP | endSUP | ANY | endANY)
# now use the transformer to apply these transforms to the input string
print(transformer.transformString(src))
Gives
p_{0} = (m^{3}+(2l_{2}+l_{1}) m^{2}+(l_{2}^{2}+2l_{1} l_{2}+l_{1}^{2}) m) /(m^{3}+(3l_{2}+2l_{1}) ) }.