I'm trying to parse a MARC file downloaded from the Library of Congress. I successfully downloaded the record using PyZ3950, but when I try to parse it with PyMarc, I get the following error:
Traceback (most recent call last):
  File "test.py", line 13, in <module>
    for record in reader:
  File "build/bdist.macosx-10.9-intel/egg/pymarc/reader.py", line 83, in next
ValueError: invalid literal for int() with base 10: '<PyZ3'
And here is my full code:
from PyZ3950 import zoom, zmarc
from pymarc import MARCReader

conn = zoom.Connection('z3950.loc.gov', 7090)
conn.databaseName = 'VOYAGER'
conn.preferredRecordSyntax = 'USMARC'

query = zoom.Query('CCL', 'ti="1066 and all that"')
res = conn.search(query)

reader = MARCReader(str(res))
for record in reader:
    print record.title()

conn.close()
Your statement:
res = conn.search(query)
returns a ResultSet, according to http://www.panix.com/~asl2/software/PyZ3950/zoom.html
Each record r in the ResultSet has its data in r.data.
So you have to feed MARCReader either each r.data individually (see the sketch after the working example below) or all of them concatenated.
This will work:
from PyZ3950 import zoom, zmarc
from pymarc import MARCReader

conn = zoom.Connection('z3950.loc.gov', 7090)
conn.databaseName = 'VOYAGER'
conn.preferredRecordSyntax = 'USMARC'

query = zoom.Query('CCL', 'ti="1066 and all that"')
res = conn.search(query)

# Concatenate the raw MARC data of every record in the ResultSet
marc = ''
for r in res:
    marc = marc + r.data

reader = MARCReader(marc)
for record in reader:
    print record.title()

conn.close()
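As noted above, you can also feed MARCReader each record's data separately instead of concatenating. A minimal sketch of that variant, reusing the same connection and query setup:

res = conn.search(query)
for r in res:
    # Parse each record's raw MARC data on its own
    reader = MARCReader(r.data)
    for record in reader:
        print record.title()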
I am working with the Azure Cognitive Services Computer Vision (Read) API. While getting the result from the API, I want to write it into a new JSON file. I tried to dump each line of the analyze_result variable, but it does not work: it says the object is not JSON serializable. My code is:
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from array import array
import os
from PIL import Image
import sys
import time
import json
import csv
subscription_key = ""
endpoint = ""
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))
def azure_ocr_api(): #image_url
    local_image_url = r"E:\Bank of Baroda\BOB IMAGE\Cheque309086.jpeg"
    # read_response = computervision_client.read_in_stream(open("./Images/" + image_url,'rb'), raw=True)
    read_response = computervision_client.read_in_stream(open(local_image_url,'rb'), raw=True)

    # Get the operation location (URL with an ID at the end) from the response
    read_operation_location = read_response.headers["Operation-Location"]
    # Grab the ID from the URL
    operation_id = read_operation_location.split("/")[-1]

    # Call the "GET" API and wait for it to retrieve the results
    while True:
        read_result = computervision_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            break
        time.sleep(1)

    list = []
    if read_result.status == OperationStatusCodes.succeeded:
        for text_result in read_result.analyze_result.read_results:
            for line in text_result.lines:
                with open('data.json', 'w', encoding='utf-8') as f:
                    json.dump(line, f, ensure_ascii=False, indent=4)
    # print(list)
    # pass
    # return list

azure_ocr_api()
print("End of Computer Vision quickstart.")
The code throws an error like this:
Traceback (most recent call last):
  File "e:\Bank of Baroda\m.py", line 44, in <module>
    azure_ocr_api()
  File "e:\Bank of Baroda\m.py", line 40, in azure_ocr_api
    json.dump(line, f, ensure_ascii=False, indent=4)
  File "C:\Users\Clasher\anaconda3\lib\json\__init__.py", line 179, in dump
    for chunk in iterable:
  File "C:\Users\Clasher\anaconda3\lib\json\encoder.py", line 438, in _iterencode
    o = _default(o)
  File "C:\Users\Clasher\anaconda3\lib\json\encoder.py", line 179, in default
TypeError: Object of type Line is not JSON serializable
Please help.
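For reference, the failure is because each line is an SDK model object (Line), not a plain dict, and json.dump only handles built-in types. A minimal sketch of one way around it, collecting plain dicts first and dumping once after the loops; the text and bounding_box attribute names follow the usual quickstart-style code and should be treated as assumptions:

results = []
if read_result.status == OperationStatusCodes.succeeded:
    for text_result in read_result.analyze_result.read_results:
        for line in text_result.lines:
            # Convert the SDK Line object into a plain, JSON-serializable dict
            results.append({"text": line.text, "bounding_box": line.bounding_box})

# Dump once, after the loops, so data.json is not rewritten for every line
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)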
I'm developing a tool to compare the database schemas of a Test and a Prod database.
I can successfully compare the schemas and print the result to the command line.
However, I don't know how to store the results in a JSON file, CSV file, or any other file. Please advise!
from pprint import pprint
from sqlalchemydiff import compare
from sqlalchemy.engine import URL
import pyodbc
import time

# Pass-through pyodbc connection strings
conn_string_dw_test = "DRIVER=..."
conn_string_dw_prod = "DRIVER=..."

connection_url_dw_test = URL.create("mssql+pyodbc", query={"odbc_connect": conn_string_dw_test})
connection_url_dw_prod = URL.create("mssql+pyodbc", query={"odbc_connect": conn_string_dw_prod})

print('')
print('-----SCHEMA COMPARE FOR TEST AND PROD DW-----')
result_dw = compare(connection_url_dw_test, connection_url_dw_prod)

if result_dw.is_match:
    print('')
    print('DW Schemas are identical')
    print('')
else:
    print('')
    print('We detected following differences')
    print('DW Test is on Left. DW Prod is on Right')
    print('')
    pprint(result_dw.errors)

# Export CSV
filename = "SchemaCompareReports\SchemaCompareReport_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
result_dw.to_csv(filename)  # NOT WORKING
print("Report exported: " + filename)
Error in first try:
Traceback (most recent call last):
  File ".\SchemaComparePOC.py", line 74, in <module>
    result_dw.to_csv(filename)
AttributeError: 'CompareResult' object has no attribute 'to_csv'
In a second try I attempted to save the results to a JSON file, but got an error:
filename = "SchemaCompareReport_DW_" + time.strftime("%Y%m%d-%H%M%S") + ".json"
a_file = open(filename, "w")
json.dump(result_dw.dump_errors, a_file)
a_file.close()
Error of second try:
Traceback (most recent call last):
  File "./SchemaComparePOC.py", line 106, in <module>
    json.dump(result_dw.dump_errors, a_file)
  File "C:\Python\Python3.8.9\lib\json\__init__.py", line 179, in dump
    for chunk in iterable:
  File "C:\Python\Python3.8.9\lib\json\encoder.py", line 438, in _iterencode
    o = _default(o)
  File "C:\Python\Python3.8.9\lib\json\encoder.py", line 179, in default
    raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type method is not JSON serializable
In a third try I got no error, but the file was empty:
filename = "SchemaCompareReport" + time.strftime("%Y%m%d-%H%M%S") + ".json"
a_file = open(filename, "w")
json.dump(result_dw.dump_errors.__dict__, a_file)
a_file.close()
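Since pprint(result_dw.errors) already prints fine, errors is a plain dictionary, whereas dump_errors (referenced without parentheses) is a bound method object, which is exactly what the second traceback complains about. A minimal sketch that dumps the errors dict directly; default=str is an assumption-level guard in case some values are not JSON-friendly:

import json
import time

filename = "SchemaCompareReport_DW_" + time.strftime("%Y%m%d-%H%M%S") + ".json"
with open(filename, "w") as a_file:
    # result_dw.errors is the same structure pprint displayed above
    json.dump(result_dw.errors, a_file, indent=4, default=str)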
I am trying to save query results from PostgreSQL into a CSV file, but the CSV file sometimes lacks the headers while still containing all the rows from the query.
import psycopg2
import csv

try:
    conn = psycopg2.connect(database = '', user = '', host = '', password = '')
except:
    print ("I am unable to connect to the database")

cursor = conn.cursor()
query = """select * from"""
cursor.execute(query)
result = cursor.fetchall()

with open("kiker.csv","wb") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames = ["Builder", "Subdivision", "Spec", "Build", "Cancel", "Price", "Sq_Ft", "PPSF", "Realtor", "Project ID"], extrasaction = 'ignore')
    writer.writeheader()
    writer.writerow(result)

print "Query 1 Created"
Error:
Traceback (most recent call last):
  File "C:\connecting.py", line 45, in <module>
    writer.writerow(result)
  File "C:\Python27\lib\csv.py", line 152, in writerow
    return self.writer.writerow(self._dict_to_list(rowdict))
  File "C:\Python27\lib\csv.py", line 149, in _dict_to_list
    return [rowdict.get(key, self.restval) for key in self.fieldnames]
AttributeError: 'list' object has no attribute 'get'
I tried both of the methods below, but neither of them includes the header information from PostgreSQL.
c = csv.writer(open("kiker.csv","wb"))
for row in result:
    c.writerow(row)
and
fp = open("kiker.csv","wb")
myFile = csv.writer(fp)
myFile.writerows(result)
fp.close()
How can I fix this?
I used Pandas to get around the situation. Worked like a treat.
import pandas as pd

cursor.execute(query)
result = cursor.fetchall()

first = pd.DataFrame(result, columns = ["Builder","Subdivision","Spec","Build","Cancel","Price","Sq_Ft","PPSF","Realtor","Project ID"])
first.to_csv("kiker.csv", index = False)
DictWriter expects dicts, not tuples: https://docs.python.org/3.6/library/csv.html#writer-objects
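If you'd rather stay with the csv module instead of Pandas, a minimal sketch; it assumes the standard DB-API cursor.description attribute to recover the column names, and writes plain tuples with csv.writer rather than DictWriter:

cursor.execute(query)
result = cursor.fetchall()

with open("kiker.csv", "wb") as csvfile:
    writer = csv.writer(csvfile)
    # cursor.description holds one entry per result column; the first field is the column name
    writer.writerow([column[0] for column in cursor.description])
    writer.writerows(result)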
I am working on a project for which I need to download a few thousand citations from PubMed. I am currently using BioPython and have written this code:
from Bio import Entrez
from Bio import Medline
from pandas import *
from sys import argv
import os

Entrez.email = "my_email"
df = read_csv("my_file_path")

i = 0
for index, row in df.iterrows():
    print (row.id)
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=row.id)
    records = Medline.parse(handle)
    for record in records:
        try:
            abstract = str(record["AB"])
        except:
            abstract = "none"
        try:
            title = str(record["TI"])
        except:
            title = "none"
        try:
            mesh = str(record["MH"])
        except:
            mesh = "none"

    path = 'my_file_path'
    filename = str(row.id) + '.txt'
    filename = os.path.join(path, filename)

    file = open(filename, "w")
    output = "title: " + str(title) + "\n\n" + "abstract: " + str(abstract) + "\n\n" + "mesh: " + str(mesh) + "\n\n"
    file.write(output)
    file.close()

    print (i)
    i = i + 1
However, I receive the following error when this code is run:
Traceback (most recent call last):
  File "my_file_path", line 13, in <module>
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=row.id)
  File "/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 176, in efetch
    if ids.count(",") >= 200:
AttributeError: 'numpy.int64' object has no attribute 'count'
Here are the first few rows of the CSV file:
id
10029645
10073846
10078088
10080457
10088066
...
Your error is at:
handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=row.id)
From the documentation:
id: UID list. Either a single UID or a comma-delimited list of UIDs.
From the examples I have seen, id is a string, not a numpy.int64 out of a pandas DataFrame. You should convert that row.id to a string.
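For instance, the efetch call from the question with only that conversion applied:

handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(row.id))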
Please look at the following code:
from xml.dom import minidom

xmldoc = minidom.parse("C:\Users\...\xml")  # This is just the address to the document
soccerfeed = xmldoc.getElementsByTagName("SoccerFeed")[0]
soccerdocument = soccerfeed.getElementsByTagName("SoccerDocument")[0]

competition = soccerdocument.getElementsByTagName("Competition")[0]
country = competition.getElementsByTagName("Country")[0].firstChild.data
name = competition.getElementsByTagName("Name")[0].firstChild.data
season = competition.getElementsByTagName("Stat")[1].firstChild.data
matchday = competition.getElementsByTagName('Stat')[3].firstChild.data

lst = [country, name, season, "matchday: " + matchday]
print lst

# Match Data
MatchData = soccerdocument.getElementsByTagName("MatchData")[0]
for MatchInfo in MatchData:
    MatchInfo = MatchData.getElementsByTagName("MatchInfo")[0]
    Attendance = MatchInfo.getElementsByTagName("Attendance")[0].firstChild.data
    Result = MatchInfo.getElementsByTagName("Result")[0]
    print (MatchInfo, "Attendance: " + Attendance)
So I just wrote this code to parse some data from an XML file. I keep getting the following error:
Traceback (most recent call last):
  File "C:\Users\Javi\Desktop\csvfile.py", line 28, in <module>
    for MatchInfo in MatchData:
TypeError: iteration over non-sequence
How do I fix this?
Loop over the return value of getElementsByTagName. Replace the following line:
MatchData = soccerdocument.getElementsByTagName("MatchData")[0]
with:
MatchData = soccerdocument.getElementsByTagName("MatchData")
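A minimal sketch of the loop after that change, mirroring the code in the question; each MatchData element is then handled on its own:

for match in soccerdocument.getElementsByTagName("MatchData"):
    MatchInfo = match.getElementsByTagName("MatchInfo")[0]
    Attendance = MatchInfo.getElementsByTagName("Attendance")[0].firstChild.data
    Result = MatchInfo.getElementsByTagName("Result")[0]
    print (MatchInfo, "Attendance: " + Attendance)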