I am currently working on a project for which I need to download a few thousand citations from PubMed. I am currently using BioPython and have written this code:
from Bio import Entrez
from Bio import Medline
from pandas import *
from sys import argv
import os
Entrez.email = "my_email"
df = read_csv("my_file_path")
i=0
for index, row in df.iterrows():
print (row.id)
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
records = Medline.parse(handle)
for record in records:
try:
abstract = str(record["AB"])
except:
abstract = "none"
try:
title = str(record["TI"])
except:
title = "none"
try:
mesh = str(record["MH"])
except:
mesh = "none"
path = 'my_file_path'
filename= str(row.id) + '.txt'
filename = os.path.join(path, filename)
file = open(filename, "w")
output = "title: "+str(title) + "\n\n" + "abstract: "+str(abstract) + "\n\n" + "mesh: "+str(mesh) + "\n\n"
file.write(output)
file.close()
print (i)
i=i+1
However, I receive the following error when this code is run:
Traceback (most recent call last):
File "my_file_path", line 13, in <module>
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
File "/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 176, in efetch
if ids.count(",") >= 200:
AttributeError: 'numpy.int64' object has no attribute 'count'
Here are the first few columns of the CSV file:
id
10029645
10073846
10078088
10080457
10088066
...
Your error is at
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
From the documentation
id
UID list. Either a single UID or a comma-delimited list of UIDs
From the examples I see, id is a string, not a numpy.int64 out of a pandas dataframe. You should convert that row.id to a string
Related
I'm developing too to compare database schema of Test and Prod database.
I can succesfully compare schema and print to command line.
However I don't know how to store results to JSON, CSV file or any file. Please advice!
from pprint import pprint
from sqlalchemydiff import compare
from sqlalchemy.engine import URL
import pyodbc
import time
# Pass through Pyodbc string
conn_string_dw_test = "DRIVER=..."
conn_string_dw_prod = "DRIVER=..."
connection_url_dw_test = URL.create("mssql+pyodbc", query={"odbc_connect": conn_string_dw_test})
connection_url_dw_prod = URL.create("mssql+pyodbc", query={"odbc_connect": conn_string_dw_prod})
print('')
print('-----SCHEMA COMPARE FOR TEST AND PROD DW-----')
result_dw = compare(connection_url_dw_test, connection_url_dw_prod)
if result_dw.is_match:
print('')
print('DW Schemas are identical')
print('')
else:
print('')
print('We detected following differences')
print('DW Test is on Left. DW Prod is on Right')
print('')
pprint(result_dw.errors)
# Export CSV
filename = "SchemaCompareReports\SchemaCompareReport_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
result_dw.to_csv(filename) # NOT WORKING
print("Report exported: " + filename)
ERROR in first try:
traceback (most recent call last):
File ".\SchemaComparePOC.py", line 74, in
result_dw.to_csv(filename)
AttributeError: 'CompareResult' object has no attribute 'to_csv'
I also tried in second try to save results to json file, but got error:
filename = "SchemaCompareReport_DW_" + time.strftime("%Y%m%d-%H%M%S") + ".json"
a_file = open(filename, "w")
json.dump(result_dw.dump_errors, a_file)
a_file.close()
Error of second try:
Traceback (most recent call last):
File "./SchemaComparePOC.py", line 106, in <module>
json.dump(result_dw.dump_errors, a_file)
File "C:\Python\Python3.8.9\lib\json\__init__.py", line 179, in dump
for chunk in iterable:
File "C:\Python\Python3.8.9\lib\json\encoder.py", line 438, in _iterencode
o = _default(o)
File "C:\Python\Python3.8.9\lib\json\encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type method is not JSON serializable
In third try I got no error, but file was empty:
filename = "SchemaCompareReport" + time.strftime("%Y%m%d-%H%M%S") + ".json"
a_file = open(filename, "w")
json.dump(result_dw.dump_errors.__dict__, a_file)
a_file.close()
I have a json file out of which one field has list data as shown below
{
"broker_address":"0.0.0.0",
"serial_id": "YYMMSSSSSSVV",
"auto_foc": true,
"timer": [0,23,30]
}
I am taking user input for timer field so I want to replace the timer data with the input value received from user. On trying it I am getting following error
Traceback (most recent call last):
File "test.py", line 23, in <module>
time, final)
TypeError: Can't convert 'list' object to str implicitly
My code snippet is as follows
import json
import os
import time
val = input("Enter your value: ")
print(val)
str1 = " "
with open('/home/pi/config.json', 'r+') as filer:
print("file read")
az = filer.read()
print(az)
read_file = az.rstrip('/n')
data = json.loads(read_file)
#print("printing file",json.loads(read_file))
time=data["timer"]
#print(read_file)
print(time)
print("Waiting.....................")
#time.sleep(2)
final = str(val)
print(final)
read_file = read_file.replace(
time, final)
with open('/home/pi/config.json', 'w') as filer:
filer.write(read_file)
Please let me know how to resolve this error.
Try this:
import json
import os
import time
val = input("Enter your value: ")
print(val)
str1 = " "
with open('/home/pi/config.json', 'r+') as filer:
print("file read")
az = filer.read()
print(az)
read_file = az.rstrip('/n')
data = json.loads(read_file)
#print("printing file",json.loads(read_file))
time=data["timer"]
#print(read_file)
print(time)
print("Waiting.....................")
#time.sleep(2)
final = str(val).split()
final = [int(i) for i in final]
print(final)
print(str(time))
read_file = read_file.replace(str(time), str(final))
print(read_file)
with open('/home/pi/config.json', 'w') as filer:
filer.write(read_file)
And update the json file from "timer": [0,23,30] to "timer": [0, 23, 30] i.e. add spaces
One thing avoid using the name time when you use that name as a variable it will replace the imported name time. Secondly the issue is that the data is a list not a string and replace is expecting a string not a list.
What you will want to do is just take advantage of json for what it is a serializer and deserializer and modify the data itself and use json to write it to a file. This also ensures you will be able to read it back out as json.
import json
import os
import time
val = input("Enter your value: ")
print(val)
str1 = " "
with open('/home/pi/config.json', 'r+') as filer:
print("file read")
data = json.load(filer)
timer=data["timer"] #DONT USE time
print(timer)
print("Waiting.....................")
#time.sleep(2)
final = str(val)
print(final)
#This next part will be up to you,
# do you want this to be the list it was before?
# or the string as input
data['timer'] = final
with open('/home/pi/config.json', 'w') as filer:
json.dump(data,filer)
I would like load data which are 10 categories of document, each cateory contains text files, but I keep getting the following error:
IndexError: list index out of range
THis is code :
def load_data(folder):
data = []
files = [join(folder, x) for x in os.listdir(folder)]
for file in files:
topic = file.split("/")[9] # this is where the error occurs
label = topic.replace(" ", "_")
name = "__label__" + label
with open(file, "rb") as f:
content = f.read()
content = content.decode('utf-16')
content = " ".join(i for i in content.split())
data.append(name + " " + content)
return data
Easy way to debug this would be to add print statements and check what the objects hold. For e.g. in this case, you can add 2 print statements at the beginning of the for loop. This would help you to figure out why you are getting IndexError
def load_data(folder):
data = []
files = [join(folder, x) for x in os.listdir(folder)]
for file in files:
print(file)
print(file.split("/"))
topic = file.split("/")[9] # this is where the error occurs
label = topic.replace(" ", "_")
name = "__label__" + label
with open(file, "rb") as f:
content = f.read()
content = content.decode('utf-16')
content = " ".join(i for i in content.split())
data.append(name + " " + content)
return data
I am trying to extract values from json ld to csv as they are in the file. There are a couple of issues I am facing.
1. The values being read for different fields are getting truncated in most of the cases. In the remaining cases the value of some other field is appearing in some other field.
2. I am also getting an error - 'Additional data' after some 4,000 lines.
The file is quite big(half a gb). I am attaching a shortened version of my code. Please tell me where am I going wrong.
The input file - I have shortened it and kept it here. There was no way of putting it here.
https://github.com/Architsi/json-ld-issue
I tried writing this script and I tried multiple online converters too
import csv, sys, math, operator, re, os, json, ijson
from pprint import pprint
filelist = []
for file in os.listdir("."):
if file.endswith(".json"):
filelist.append(file)
for input in filelist:
newCsv = []
splitlist = input.split(".")
output = splitlist[0] + '.csv'
newFile = open(output, 'w', newline='') #wb for windows, else you'll see newlines added to csv
# initialize csv writer
writer = csv.writer(newFile)
#Name of the columns
header_row = ('Format', 'Description', 'Object', 'DataProvider')
writer.writerow(header_row)
with open(input, encoding="utf8") as json_file:
data = ijson.items(json_file, 'item')
#passing all the values through try except
for s in data:
source = s['_source']
try:
source_resource = source['sourceResource']
except:
print ("Warning: No source resource in record ID: " + id)
try:
data_provider = source['dataProvider'].encode()
except:
data_provider = "N/A"
try:
_object = source['object'].encode()
except:
_object = "N/A"
try:
descriptions = source_resource['description']
string = ""
for item in descriptions:
if len(descriptions) > 1:
description = item.encode() #+ " | "
else:
description = item.encode()
string = string + description
description = string.encode()
except:
description = "N/A"
created = ""
#writing it to csv
write_tuple = ('format', description, _object, data_provider)
writer.writerow(write_tuple)
print ("File written to " + output)
newFile.close()
The error that I am getting is this- raise common.JSONError('Additional Data')
Expected result is a csv file with all the columns and correct values
Please look at the following code:
from xml.dom import minidom
xmldoc = minidom.parse("C:\Users\...\xml") #This is just the address to the document
soccerfeed = xmldoc.getElementsByTagName("SoccerFeed")[0]
soccerdocument = soccerfeed.getElementsByTagName("SoccerDocument")[0]
competition = soccerdocument.getElementsByTagName("Competition")[0]
country = competition.getElementsByTagName("Country")[0].firstChild.data
name = competition.getElementsByTagName("Name")[0].firstChild.data
season = competition.getElementsByTagName("Stat")[1].firstChild.data
matchday = competition.getElementsByTagName('Stat')[3].firstChild.data
lst = [country, name, season, "matchday: "+ matchday]
print lst
#Match Data
MatchData = soccerdocument.getElementsByTagName("MatchData")[0]
for MatchInfo in MatchData:
MatchInfo = MatchData.getElementsByTagName("MatchInfo")[0]
Attendance = MatchInfo.getElementsByTagName("Attendance")[0].firstChild.data
Result = MatchInfo.getElementsByTagName("Result")[0]
print (MatchInfo, "Attendance: "+ Attendance)
So I just wrote this code to parse some data from a xml file. I keep getting the following error:
Traceback (most recent call last):
File "C:\Users\Javi\Desktop\csvfile.py", line 28, in <module>
for MatchInfo in MatchData:
TypeError: iteration over non-sequence
How do I fix this?
Loop over return value of getElementsByTagName.
Replace following line
MatchData = soccerdocument.getElementsByTagName("MatchData")[0]
to
MatchData = soccerdocument.getElementsByTagName("MatchData")