Creating loops from XML data - Python

Please look at the following code:
from xml.dom import minidom
xmldoc = minidom.parse("C:\Users\...\xml") #This is just the address to the document
soccerfeed = xmldoc.getElementsByTagName("SoccerFeed")[0]
soccerdocument = soccerfeed.getElementsByTagName("SoccerDocument")[0]
competition = soccerdocument.getElementsByTagName("Competition")[0]
country = competition.getElementsByTagName("Country")[0].firstChild.data
name = competition.getElementsByTagName("Name")[0].firstChild.data
season = competition.getElementsByTagName("Stat")[1].firstChild.data
matchday = competition.getElementsByTagName('Stat')[3].firstChild.data
lst = [country, name, season, "matchday: "+ matchday]
print lst
#Match Data
MatchData = soccerdocument.getElementsByTagName("MatchData")[0]
for MatchInfo in MatchData:
    MatchInfo = MatchData.getElementsByTagName("MatchInfo")[0]
    Attendance = MatchInfo.getElementsByTagName("Attendance")[0].firstChild.data
    Result = MatchInfo.getElementsByTagName("Result")[0]
    print (MatchInfo, "Attendance: "+ Attendance)
So I just wrote this code to parse some data from an XML file. I keep getting the following error:
Traceback (most recent call last):
File "C:\Users\Javi\Desktop\csvfile.py", line 28, in <module>
for MatchInfo in MatchData:
TypeError: iteration over non-sequence
How do I fix this?

Loop over the return value of getElementsByTagName.
Replace the following line:
MatchData = soccerdocument.getElementsByTagName("MatchData")[0]
with:
MatchData = soccerdocument.getElementsByTagName("MatchData")
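For illustration, a minimal sketch of the corrected loop, continuing from the variables already defined in the question (it assumes each MatchData element contains MatchInfo children with an Attendance child, as in the original code):
match_data_elements = soccerdocument.getElementsByTagName("MatchData")
for match_data in match_data_elements:
    # Query each MatchData element for its own MatchInfo and Attendance children
    match_info = match_data.getElementsByTagName("MatchInfo")[0]
    attendance = match_info.getElementsByTagName("Attendance")[0].firstChild.data
    print(match_info, "Attendance: " + attendance)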

Related

python error: Traceback (most recent call last), IndexError: list index out of range

I'm trying to run the Python script below (vcf2treemix.py) with the command
<./vcf2treemix.py -vcf allsamples14_filtered_1_autosomes38_bisnps.vcf.gz -pop allsamples14.clust.pop>
I got this error with both Python 2 and 3:
######### error ###
Traceback (most recent call last):
File "./vcf2treemix.py", line 99, in <module>
main()
File "./vcf2treemix.py", line 95, in main
pop_obj = get_pops(pop_file)
File "./vcf2treemix.py", line 34, in get_pops
pops[fields[0]] = fields[1].split()
IndexError: list index out of range
######### vcf2treemix.py ###
#!/usr/bin/python
# vcf2treemix.py
# Converts a vcf file into TreeMix input

import argparse
from collections import OrderedDict

parser = argparse.ArgumentParser(description="Parsing statistical output of"
                                             " VCFtools")
parser.add_argument("-vcf", dest="vcf_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14_filtered_1_autosomes38_bisnps_main.vcf.gz",
                    required=True)
parser.add_argument("-pop", dest="pop_file", help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14.clust.pop",
                    required=True)
arg = parser.parse_args()

def get_pops(pop_file):
    """
    Returns a dictionary with pop identifier as key and taxa as a list of
    strings. In the pop file, each populations should be in one line, starting
    withe pop name, a colon and the corresponding taxa separated by whitespace.
    E.g.:
    pop1: taxon1 taxon2 taxon3
    """
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line in fh:
            fields = line.strip().split(":")
            pops[fields[0]] = fields[1].split()
    return pops

def vcf2treemix(vcf_file, pop_obj):
    """
    Converts a vcf file into treemix format.
    """
    vcf_fh = open(vcf_file)
    output_name = vcf_file.strip(".vcf") + ".tmix"
    output_fh = open(output_name, "w")
    # Write header for tmix file
    output_fh.write("{}\n".format(" ".join([x for x in pop_obj.keys()])))
    for line in vcf_fh:
        # Skip header
        if line.startswith("##"):
            pass
        # Get taxon positions
        elif line.startswith("#CHROM"):
            taxa_pos = line.strip().split()
        # Ignore empty lines
        elif line.strip() != "":
            fields = line.strip().split()
            # Ignore loci with more than two alleles
            if len(fields[4]) > 1:
                continue
            # Get allele counts for each populations
            temp_pop = OrderedDict((x, [0,0]) for x in pop_obj.keys())
            for pop, taxa in pop_obj.items():
                for taxon in taxa:
                    # Get taxon genotype
                    gen = fields[taxa_pos.index(taxon)]
                    # Skip if gen is missing data
                    if gen == "./.":
                        continue
                    temp_pop[pop][0] += gen.count("0")
                    temp_pop[pop][1] += gen.count("1")
            # Write current locus to file
            output_fh.write("{}\n".format(" ".join([str(x[0]) + "," + str(x[1]) for x in temp_pop.values()])))
    vcf_fh.close()
    output_fh.close()

def main():
    # Args
    vcf_file = arg.vcf_file
    pop_file = arg.pop_file
    pop_obj = get_pops(pop_file)
    vcf2treemix(vcf_file, pop_obj)

main()
I have zero experience with Python; I just run the script to manipulate genetic data.
Any help will be highly appreciated.
Thanks,
Ali
I tried Python 2 and 3, and I expected the script to work straightforwardly. I think there is no problem with the input data.
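The traceback points at get_pops: a line in the .pop file without a colon (for example, a blank trailing line) makes fields[1] raise IndexError. Purely as an illustration, and not part of the original script, a defensive sketch of that function which skips such lines, assuming the pop-file format described in its docstring:
from collections import OrderedDict

def get_pops(pop_file):
    """Map each population name to its list of taxa, skipping malformed lines."""
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line in fh:
            line = line.strip()
            # A line with no colon (e.g. an empty trailing line) has no fields[1],
            # which is what triggers the IndexError in the original function.
            if ":" not in line:
                continue
            name, taxa = line.split(":", 1)
            pops[name] = taxa.split()
    return pops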

How can I append a text file's variable values at the corresponding index of a 2D array/list?

Code:
opener = open("gymclub.txt", "r")
reader = opener.readline()
listPressups = [["",],["",],["",],["",],["",],["",],["",],["",],["",],["",],["",],["",],["",]]
while reader!="":
    splitting=reader.split(",")
    name = splitting[0]
    press_ups = splitting[1]
    pull_ups = splitting[2]
    reader = opener.readline()
    for x in range(1,12):
        listPressups[0][x].append(int(press_ups))
listPressups.sort(reverse=True)
print(listPressups)
Output:
Traceback (most recent call last):
File "C:/Users/Nutzer/Desktop/Python/practice_NEA/index.py", line 36, in <module>
listPressups[0][x].append(int(press_ups))
IndexError: list index out of range
Desired Output:
[["",75],["",74],["",73],["",67],["",66],["",58],["",45],["",33],["",30],["",25],["",10],["",8]]
What method can I use to reach my desired output?
The text file I used:
Try this:
opener = open("gymclub.txt", "r")
listPressups = []
for line in opener.readlines():
    press_ups = int(line.split(",")[1])
    listPressups.append(["", press_ups])
listPressups.sort(reverse=True)
opener.close()
print(listPressups)
Instead of
listPressups[0][x].append(int(press_ups))
It should be
listPressups[x][1].append(int(press_ups))
You could just start with an empty list (listPressups here) and append to it inside the while loop, as shown below.
opener = open("gymclub.txt", "r")
reader = opener.readline()
#listPressups = [["",],["",],["",],["",],["",],["",],["",],["",],["",],["",],["",],["",],["",]]
listPressups = []
while reader!="":
    splitting=reader.split(",")
    name = splitting[0]
    press_ups = splitting[1]
    pull_ups = splitting[2]
    reader = opener.readline()
    listPressups.append(["",int(press_ups)]) #Here we append an empty string with each value
listPressups.sort(reverse=True)
print(listPressups)
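As a variant not taken from the original answers, the same idea can be sketched with the csv module and a context manager; this assumes gymclub.txt holds exactly three comma-separated fields per line (name, press-ups, pull-ups):
import csv

listPressups = []
with open("gymclub.txt", newline="") as fh:
    for name, press_ups, pull_ups in csv.reader(fh):
        listPressups.append(["", int(press_ups)])

# Sort by the press-up count (the second element of each pair), highest first
listPressups.sort(key=lambda pair: pair[1], reverse=True)
print(listPressups)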

BioPython Count Error

I am currently working on a project for which I need to download a few thousand citations from PubMed. I am currently using BioPython and have written this code:
from Bio import Entrez
from Bio import Medline
from pandas import *
from sys import argv
import os
Entrez.email = "my_email"
df = read_csv("my_file_path")
i=0
for index, row in df.iterrows():
    print (row.id)
    handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
    records = Medline.parse(handle)
    for record in records:
        try:
            abstract = str(record["AB"])
        except:
            abstract = "none"
        try:
            title = str(record["TI"])
        except:
            title = "none"
        try:
            mesh = str(record["MH"])
        except:
            mesh = "none"
    path = 'my_file_path'
    filename= str(row.id) + '.txt'
    filename = os.path.join(path, filename)
    file = open(filename, "w")
    output = "title: "+str(title) + "\n\n" + "abstract: "+str(abstract) + "\n\n" + "mesh: "+str(mesh) + "\n\n"
    file.write(output)
    file.close()
    print (i)
    i=i+1
However, I receive the following error when this code is run:
Traceback (most recent call last):
File "my_file_path", line 13, in <module>
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
File "/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 176, in efetch
if ids.count(",") >= 200:
AttributeError: 'numpy.int64' object has no attribute 'count'
Here are the first few rows of the CSV file:
id
10029645
10073846
10078088
10080457
10088066
...
Your error is at
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
From the documentation
id
UID list. Either a single UID or a comma-delimited list of UIDs
From the examples I have seen, id is a string, not a numpy.int64 out of a pandas DataFrame. You should convert that row.id to a string.
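A minimal sketch of that change, reusing the efetch call from the question with the id cast to str:
handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(row.id))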

PyMarc Invalid Literal Error

I'm trying to parse a MARC file downloaded from the Library of Congress. I've successfully downloaded the record using the PyZ3950, but when I try to parse the file using PyMarc, I get the following error:
Traceback (most recent call last):
File "test.py", line 13, in <module>
for record in reader:
File "build/bdist.macosx-10.9-intel/egg/pymarc/reader.py", line 83, in next
ValueError: invalid literal for int() with base 10: '<PyZ3'
And here is my full code:
from PyZ3950 import zoom, zmarc
from pymarc import MARCReader
conn = zoom.Connection('z3950.loc.gov', 7090)
conn.databaseName = 'VOYAGER'
conn.preferredRecordSyntax = 'USMARC'
query = zoom.Query('CCL', 'ti="1066 and all that"')
res = conn.search(query)
reader = MARCReader(str(res))
for record in reader:
    print record.title()
conn.close()
Your statement:
res = conn.search(query)
returns a ResultSet, according to http://www.panix.com/~asl2/software/PyZ3950/zoom.html
Each record r in the ResultSet has its data in r.data.
So you have to feed MARCReader each r.data, or all of them concatenated.
This will work:
from PyZ3950 import zoom, zmarc
from pymarc import MARCReader
conn = zoom.Connection('z3950.loc.gov', 7090)
conn.databaseName = 'VOYAGER'
conn.preferredRecordSyntax = 'USMARC'
query = zoom.Query('CCL', 'ti="1066 and all that"')
res = conn.search(query)
marc = ''
for r in res:
    marc = marc + r.data
reader = MARCReader(marc)
for record in reader:
    print record.title()
conn.close()
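A small variant of the same idea, not from the original answer: collect the record data and join it once, which avoids repeated string concatenation when there are many records:
marc = ''.join(r.data for r in res)
reader = MARCReader(marc)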

Extract tweets from a text file (Python)

Sorry, I am just trying to store 'id_str' from each tweet in a new list called ids,
but I am getting the following error:
Traceback (most recent call last):
File "extract_tweet.py", line 17, in
print tweet['id_str']
KeyError: 'id_str'
My code is:
import json
import sys
if __name__ == '__main__':
    tweets = []
    for line in open (sys.argv[1]):
        try:
            tweets.append(json.loads(line))
        except:
            pass
    ids = []
    for tweet in tweets:
        ids.append(tweet['id_str'])
The JSON data from tweets is sometimes missing fields. Try something like this:
ids = []
for tweet in tweets:
    if 'id_str' in tweet:
        ids.append(tweet['id_str'])
or equivalently,
ids = [tweet['id_str'] for tweet in tweets if 'id_str' in tweet]
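A further variant, not from the original answers: dict.get returns None instead of raising KeyError when the key is missing, so tweets without the field can be filtered out afterwards:
ids = [tweet.get('id_str') for tweet in tweets if tweet.get('id_str') is not None]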
import json
tweets = []
tweets.append(
    json.loads('{"a": 1}')
)
tweet = tweets[0]
print(tweet)
print( tweet['id_str'] )
--output:--
{'a': 1}
Traceback (most recent call last):
File "1.py", line 9, in <module>
print( tweet['id_str'] )
KeyError: 'id_str'
And:
my_dict = {u"id_str": 1}
print my_dict["id_str"]
--output:--
1
