Combine several functions in Python

I parse RSS feeds and collect the news headlines:
def print_headlines_test(rss_dict):
    for key, url in rss_dict.items():
        feed = feedparser.parse(url)
        headlines = []
        allheadlines = []
        for newsitem in feed['items']:
            headlines.append(newsitem['title'])
        for key, url in rss_dict.items():
            allheadlines.extend(headlines)
Then I save this to a CSV file and read it back into a DataFrame:
def write_and_read():
    header = ['Tittle', 'Desc']
    with open('news.csv', 'w', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)  # needed for writer.writerow below
        writer.writerow(i for i in header)
        for a in zip(allheadlines):
            writer.writerow(a)
    df = pd.read_csv('news.csv')
Then I search the news for certain targets (t and t2):
t = 'word1|word2|word3'
t2 = 'word3|word4|word5'
And I build a DataFrame of the matches:
def certain_words(t, t2):
    result = df.apply(lambda x: x.str.contains(t, na=False,
                                               flags=re.IGNORECASE, regex=True)).any(axis=1)
    result2 = df.apply(lambda x: x.str.contains(t2, na=False,
                                                flags=re.IGNORECASE, regex=True)).any(axis=1)
    df[result & result2]
So my input values are rss_dict (a dictionary of RSS feeds in the format {'rss-name': 'rss-link'}) and two targets (t, t2).
Now my question: how should I combine all of these functions into something (a function, or maybe a class) so that I can set these three values (rss_dict, t, t2) and have my code run immediately?

You can use a class with all these functions included inside as follows:
class News:
    def __init__(self, rss_dict, t, t2):
        # assign the arguments to the instance (not the other way around)
        self.rss_dict = rss_dict
        self.t = t
        self.t2 = t2

    def print_headlines_test(self):
        allheadlines = []
        for key, url in self.rss_dict.items():
            feed = feedparser.parse(url)
            headlines = []
            for newsitem in feed['items']:
                headlines.append(newsitem['title'])
            allheadlines.extend(headlines)
        self.allheadlines = allheadlines

    def write_and_read(self):
        header = ['Tittle', 'Desc']
        with open('news.csv', 'w', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            for a in zip(self.allheadlines):
                writer.writerow(a)
        self.df = pd.read_csv('news.csv')

    def certain_words(self):
        result = self.df.apply(lambda x: x.str.contains(self.t, na=False,
                                                        flags=re.IGNORECASE, regex=True)).any(axis=1)
        result2 = self.df.apply(lambda x: x.str.contains(self.t2, na=False,
                                                         flags=re.IGNORECASE, regex=True)).any(axis=1)
        return self.df[result & result2]
Pass the parameters by creating an object of the class; the methods then have to be called on that object for them to run.
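For example, a minimal usage sketch (the feed URL and target words are placeholders, and feedparser, csv, re and pandas are assumed to be imported):
rss_dict = {'bbc': 'http://feeds.bbci.co.uk/news/rss.xml'}  # placeholder feed
news = News(rss_dict, 'word1|word2|word3', 'word3|word4|word5')
news.print_headlines_test()     # collect headlines into news.allheadlines
news.write_and_read()           # write news.csv and load it into news.df
matches = news.certain_words()  # rows matching both target patterns
print(matches)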

Related

problem with creating column name dynamically in a dataframe

I am trying to create a pandas DataFrame dynamically. So far I'm fine with capturing the data within the DataFrame, but not with the names of the columns.
I want the column names to be based on the 'category' of the data that comes with the record I am reading in my function, but I always get the last one.
def funct_example(client):
    documents = [v_document]
    poller = client.begin_analyze_entities(documents)
    result = poller.result()
    docs = [doc for doc in result if not doc.is_error]
    i = 1
    df_final = pd.DataFrame()
    for idx, doc in enumerate(docs):
        for relation in doc.entity_relations:
            for role in relation.roles:
                name = str([format(entity.category)]) + str(i)  # <-- this always picks up the last category
                d = {name: "'{}' with entity '{}'".format(role.name, role.entity.text)}  # <-- this is OK
                df = pd.DataFrame(data=d, index=[0])
                df_final = pd.concat([df_final, df], axis=1)
                i = i + 1
    display(df_final)
    return df_final

df_new_2 = funct_example(client)
I've tried adding an extra loop between creating the DataFrame and the concat call, like so:
for col in df.columns:
    name = str([format(entity.category)]) + str(i)
    df = df.rename(columns={col: name})
But the last category still appears in the column name...
How can I solve that?
Thank you very much in advance.
SOLUTION:
for idx, doc in enumerate(docs):
    for relation in doc.entity_relations:
        for role in relation.roles:
            name = 'Relation_' + format(relation.relation_type) + '_' + str(i)
            d = {name: "'{}' with entity '{}'".format(role.name, role.entity.text)}
            df = pd.DataFrame(data=d, index=[0])
            df_final = pd.concat([df_final, df], axis=1)
            i = i + 1
display(df_final)
return df_final

df_relations = funct_example(client)
Regards!! :D
It's difficult to suggest a solution without knowing the properties of all the objects being used.
The 'entity' object is not defined within the function, so is it a global variable?
Does role have an 'entity', which in turn has a 'category' property? I'm assuming so, since role does have an entity property:
d = {name : "'{}' with entity '{}'".format(role.name, role.entity.text)} # <-- this is OK
Besides, the name variable, while initialized, is not used.
Maybe try:
name = str([format(role.entity.category)]) + str(i)
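For context, a hedged sketch of how that line would sit in the original loop (assuming, as above, that role.entity has a category property):
for idx, doc in enumerate(docs):
    for relation in doc.entity_relations:
        for role in relation.roles:
            # use the role's entity, not the undefined global 'entity',
            # so each iteration picks up its own category
            name = str([format(role.entity.category)]) + str(i)
            d = {name: "'{}' with entity '{}'".format(role.name, role.entity.text)}
            df = pd.DataFrame(data=d, index=[0])
            df_final = pd.concat([df_final, df], axis=1)
            i = i + 1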

How to optimize PRAW and pandas data collection to make it more pythonic?

I am using PRAW to get data from Reddit, and created this function to do so on multiple subreddits.
It works; however, I am working on a more concise/pythonic version but can't figure out how to create a single for loop doing the job of the three below.
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# This function aims to scrape data from a list of subreddits.
# From these subreddits, I would like to get the #new, #hot and #rising posts
def get_data(size_new, size_hot, size_rising, subs_number):
    posts = []
    followers = []
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S')
    # getting x new posts
    for subreddit in subs.new(limit=size_new):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x hot posts
    for subreddit in subs.hot(limit=size_hot):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x rising posts
    for subreddit in subs.rising(limit=size_rising):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting subreddit subscriber numbers
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # join the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
My request: how could it be more concise/optimized? What methodology do you use to make your code more readable or, even better, to optimize it for runtime/computational resources?
Thank you
There are various principles for writing better code, and various tools for finding the 'code smells' that may be lurking in your code:
DRY - Don't Repeat Yourself
KISS - Keep It Simple, Stupid
SOLID
etc.
Taking a dive into the code that you posted and applying some of these principles at a surface level would refactor some of your code into looking like:
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# check that the date is greater than the target date
# return true/false
def check_date(subreddit, targeted_date):
    return subreddit.created >= targeted_date

# get specific post data
def get_post_data(subreddit):
    return [subreddit.created, subreddit.subreddit, subreddit.title, subreddit.selftext]

# get posts by sort type
def get_subreddit_post_types(subreddit_sort, targeted_date):
    return [get_post_data(subreddit) for subreddit in subreddit_sort if check_date(subreddit, targeted_date)]

# This function aims to scrape data from a list of subreddits.
# From these subreddits, I would like to get the #new, #hot and #rising posts
def get_data(size_new, size_hot, size_rising, subs_number):
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S').timestamp()
    posts = []
    followers = []
    # getting x new posts
    posts.extend(get_subreddit_post_types(subs.new(limit=size_new), targeted_date))
    # getting x hot posts
    posts.extend(get_subreddit_post_types(subs.hot(limit=size_hot), targeted_date))
    # getting x rising posts
    posts.extend(get_subreddit_post_types(subs.rising(limit=size_rising), targeted_date))
    # getting subreddit subscriber numbers
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # join the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
As for better optimizing your computational resources (which are you trying to optimize, memory or runtime?): the same process applies to either, which is to look at your code to see what can be changed to decrease one versus the other.
From looking at your code, one general optimization would be to examine which 'duplicate' posts you are getting. Each of hot/rising/new pulls posts from similar date ranges, and hot/rising may be completely encompassed inside of new. If so, you could remove the duplicate check from the posts you gathered (so that you don't have to verify they are different), and possibly remove the hot/rising calls altogether.
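Since the question asked for a single for loop doing the job of the three, here is a minimal sketch on top of the helpers above (same assumed r and subs objects; this would replace the three extend() calls inside get_data):
# iterate over the three listing methods in one loop
for listing, size in ((subs.new, size_new), (subs.hot, size_hot), (subs.rising, size_rising)):
    posts.extend(get_subreddit_post_types(listing(limit=size), targeted_date))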

parse xml to pandas data frame in python

I am trying to read an XML file and convert it to pandas. However, it returns an empty DataFrame.
This is the sample of xml structure:
<Instance ID="1">
    <MetaInfo StudentID="DTSU040" TaskID="LP03_PR09.bLK.sh" DataSource="DeepTutorSummer2014"/>
    <ProblemDescription>A car windshield collides with a mosquito, squashing it.</ProblemDescription>
    <Question>How does this work tion?</Question>
    <Answer>tthis is my best </Answer>
    <Annotation Label="correct(0)|correct_but_incomplete(1)|contradictory(0)|incorrect(0)">
        <AdditionalAnnotation ContextRequired="0" ExtraInfoInAnswer="0"/>
        <Comments Watch="1"> The student forgot to tell the opposite force. Opposite means opposite direction, which is important here. However, one can argue that the opposite is implied. See the reference answers.</Comments>
    </Annotation>
    <ReferenceAnswers>
        1: Since the windshield exerts a force on the mosquito, which we can call action, the mosquito exerts an equal and opposite force on the windshield, called the reaction.
    </ReferenceAnswers>
</Instance>
I have tried this code, however it's not working on my side; it returns an empty DataFrame:
import pandas as pd
import xml.etree.ElementTree as et

xtree = et.parse("grade_data.xml")
xroot = xtree.getroot()

df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
           "ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", "ReferenceAnswers"]
rows = []

for node in xroot:
    s_name = node.attrib.get("ID")
    s_student = node.find("StudentID")
    s_task = node.find("TaskID")
    s_source = node.find("DataSource")
    s_desc = node.find("ProblemDescription")
    s_question = node.find("Question")
    s_ans = node.find("Answer")
    s_label = node.find("Label")
    s_contextrequired = node.find("ContextRequired")
    s_extraInfoinAnswer = node.find("ExtraInfoInAnswer")
    s_comments = node.find("Comments")
    s_watch = node.find("Watch")
    s_referenceAnswers = node.find("ReferenceAnswers")
    rows.append({"ID": s_name, "StudentID": s_student, "TaskID": s_task,
                 "DataSource": s_source, "ProblemDescription": s_desc,
                 "Question": s_question, "Answer": s_ans, "Label": s_label,
                 "s_contextrequired": s_contextrequired, "ExtraInfoInAnswer": s_extraInfoinAnswer,
                 "Comments": s_comments, "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers,
                 })

out_df = pd.DataFrame(rows, columns=df_cols)
The problem in your solution was that the element data extraction was not done properly. The XML you mentioned in the question is nested in several layers, and that is why we need to read and extract the data recursively. The following solution should give you what you need in this case, although I would encourage you to look at this article and the Python documentation for more clarity.
Method: 1
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

def xml2df(xml_source, df_cols, source_is_file=False, show_progress=True):
    """Parse the input XML source and store the result in a pandas
    DataFrame with the given columns.

    For xml_source = xml_file, set: source_is_file = True
    For xml_source = xml_string, set: source_is_file = False

    <element attribute_key1=attribute_value1, attribute_key2=attribute_value2>
        <child1>Child 1 Text</child1>
        <child2>Child 2 Text</child2>
        <child3>Child 3 Text</child3>
    </element>

    Note that for an xml structure as shown above, the attribute information of
    the element tag can be accessed by list(element). Any text associated with
    the <element> tag can be accessed as element.text, and the name of the tag
    itself can be accessed with element.tag.
    """
    if source_is_file:
        xtree = ET.parse(xml_source)  # xml_source = xml_file
        xroot = xtree.getroot()
    else:
        xroot = ET.fromstring(xml_source)  # xml_source = xml_string

    consolidator_dict = dict()
    default_instance_dict = {label: None for label in df_cols}

    def get_children_info(children, instance_dict):
        # We avoid using element.getchildren() as it is deprecated.
        # Instead use list(element) to get the child elements.
        for child in children:
            if len(list(child)) > 0:
                instance_dict = get_children_info(list(child), instance_dict)
            if len(list(child.keys())) > 0:
                items = child.items()
                instance_dict.update({key: value for (key, value) in items})
            instance_dict.update({child.tag: child.text})
        return instance_dict

    # Loop over all instances
    for instance in list(xroot):
        instance_dict = default_instance_dict.copy()
        ikey, ivalue = instance.items()[0]  # The first attribute is "ID"
        instance_dict.update({ikey: ivalue})
        if show_progress:
            print('{}: {}={}'.format(instance.tag, ikey, ivalue))
        # Loop inside every instance
        instance_dict = get_children_info(list(instance), instance_dict)
        consolidator_dict[ivalue] = instance_dict.copy()

    df = pd.DataFrame(consolidator_dict).T
    df = df[df_cols]
    return df
Run the following to generate the desired output:
xml_source = r'grade_data.xml'
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
           "ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", "ReferenceAnswers"]
df = xml2df(xml_source, df_cols, source_is_file=True)
df
Method: 2
Given you have the xml_string, you could convert xml >> dict >> dataframe. Run the following to get the desired output.
Note: you will need to install xmltodict to use Method 2. This method is inspired by the solution suggested by @martin-blech at How to convert XML to JSON in Python? Kudos to @martin-blech for making it.
pip install -U xmltodict
Solution
def read_recursively(x, instance_dict):
    txt = ''
    for key in x.keys():
        k = key.replace("@", "")  # xmltodict prefixes attribute keys with "@"
        if k in df_cols:
            if isinstance(x.get(key), dict):
                instance_dict, txt = read_recursively(x.get(key), instance_dict)
            instance_dict.update({k: x.get(key)})
        else:
            # dig deeper if the value is another dict
            if isinstance(x.get(key), dict):
                instance_dict, txt = read_recursively(x.get(key), instance_dict)
            # add simple text associated with the element
            if k == '#text':
                txt = x.get(key)
            # update text to the corresponding parent element
            if (k != '#text') and (txt != ''):
                instance_dict.update({k: txt})
    return (instance_dict, txt)
You will need the function read_recursively() given above. Now run the following.
import xmltodict, json

o = xmltodict.parse(xml_string)  # INPUT: XML_STRING
#print(json.dumps(o))  # uncomment to see the xml converted to a json string

consolidated_dict = dict()
oi = o['Instances']['Instance']
for x in oi:
    instance_dict = dict()
    instance_dict, _ = read_recursively(x, instance_dict)
    consolidated_dict.update({x.get("@ID"): instance_dict.copy()})

df = pd.DataFrame(consolidated_dict).T
df = df[df_cols]
df
Several issues:
Calling .find on the loop variable, node, expects a child node to exist: current_node.find('child_of_current_node'). However, since all the nodes are children of the root, they do not maintain their own children, so no loop is required;
Not checking for NoneType, which can result from missing nodes with find() and prevents retrieving .tag or .text or other attributes;
Not retrieving node content with .text; otherwise the <Element... object is returned.
Consider this adjustment using the ternary condition expression, a if condition else b, to ensure each variable has a value regardless:
rows = []

s_name = xroot.attrib.get("ID")
s_student = xroot.find("StudentID").text if xroot.find("StudentID") is not None else None
s_task = xroot.find("TaskID").text if xroot.find("TaskID") is not None else None
s_source = xroot.find("DataSource").text if xroot.find("DataSource") is not None else None
s_desc = xroot.find("ProblemDescription").text if xroot.find("ProblemDescription") is not None else None
s_question = xroot.find("Question").text if xroot.find("Question") is not None else None
s_ans = xroot.find("Answer").text if xroot.find("Answer") is not None else None
s_label = xroot.find("Label").text if xroot.find("Label") is not None else None
s_contextrequired = xroot.find("ContextRequired").text if xroot.find("ContextRequired") is not None else None
s_extraInfoinAnswer = xroot.find("ExtraInfoInAnswer").text if xroot.find("ExtraInfoInAnswer") is not None else None
s_comments = xroot.find("Comments").text if xroot.find("Comments") is not None else None
s_watch = xroot.find("Watch").text if xroot.find("Watch") is not None else None
s_referenceAnswers = xroot.find("ReferenceAnswers").text if xroot.find("ReferenceAnswers") is not None else None

rows.append({"ID": s_name, "StudentID": s_student, "TaskID": s_task,
             "DataSource": s_source, "ProblemDescription": s_desc,
             "Question": s_question, "Answer": s_ans, "Label": s_label,
             "s_contextrequired": s_contextrequired, "ExtraInfoInAnswer": s_extraInfoinAnswer,
             "Comments": s_comments, "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers
             })

out_df = pd.DataFrame(rows, columns=df_cols)
Alternatively, run a more dynamic version assigning to an inner dictionary using the iterator variable:
rows = []
for node in xroot:
    inner = {}
    inner[node.tag] = node.text
    rows.append(inner)

out_df = pd.DataFrame(rows, columns=df_cols)
Or list/dict comprehension:
rows = [{node.tag: node.text} for node in xroot]
out_df = pd.DataFrame(rows, columns = df_cols)

TypeError: 'DataFrame' object is not callable python function

I have two functions: one which creates a dataframe from a csv, and another which manipulates that dataframe. There is no problem the first time I pass the raw data through the lsc_age(import_data()) functions. However, I get the above-referenced error (TypeError: 'DataFrame' object is not callable) upon second and subsequent attempts. Any ideas for how to solve the problem?
def import_data(csv, date1, date2):
    global data
    data = pd.read_csv(csv, header=1)
    data = data.iloc[:, [0, 1, 4, 6, 7, 8, 9, 11]]
    data = data.dropna(how='all')
    data = data.rename(columns={"National: For Dates 9//1//"+date1+" - 8//31//"+date2: 'event',
                                'Unnamed: 1': 'time', 'Unnamed: 4': 'points',
                                'Unnamed: 6': 'name', 'Unnamed: 7': 'age', 'Unnamed: 8': 'lsc',
                                'Unnamed: 9': 'club', 'Unnamed: 11': 'date'})
    data = data.reset_index().drop('index', axis=1)
    data = data[data.time != 'Time']
    data = data[data.points != 'Power ']
    data = data[data['event'] != "National: For Dates 9//1//"+date1+" - 8//31//"+date2]
    data = data[data['event'] != 'USA Swimming, Inc.']
    data = data.reset_index().drop('index', axis=1)
    for i in range(len(data)):
        if len(str(data['event'][i])) <= 3:
            data['event'][i] = data['event'][i-1]
        else:
            data['event'][i] = data['event'][i]
    data = data.dropna()
    age = []
    event = []
    gender = []
    for row in data.event:
        gender.append(row.split(' ')[0])
        if row[:9] == 'Female 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        elif row[:7] == 'Male 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        else:
            n = 2
            groups = row.split(' ')
            event.append(' '.join(groups[n:]))
            groups = row.split(' ')
            age.append(groups[1])
    data['age_group'] = age
    data['event_simp'] = event
    data['gender'] = gender
    data['year'] = date2
    return data

def lsc_age(data_two):
    global lsc, lsc_age, top, all_performers
    lsc = pd.DataFrame(data_two['event'].groupby(data_two['lsc']).count()).reset_index().sort_values(by='event', ascending=False)
    lsc_age = data_two.groupby(['year', 'age_group', 'lsc'])['event'].count().reset_index().sort_values(by=['age_group', 'event'], ascending=False)
    top = pd.concat([lsc_age[lsc_age.age_group == '10 & under'].head(), lsc_age[lsc_age.age_group == '11-12'].head(),
                     lsc_age[lsc_age.age_group == '13-14'].head(), lsc_age[lsc_age.age_group == '15-16'].head(),
                     lsc_age[lsc_age.age_group == '17-18'].head()], ignore_index=True)
    all_performers = pd.concat([lsc_age[lsc_age.age_group == '10 & under'], lsc_age[lsc_age.age_group == '11-12'],
                                lsc_age[lsc_age.age_group == '13-14'], lsc_age[lsc_age.age_group == '15-16'],
                                lsc_age[lsc_age.age_group == '17-18']], ignore_index=True)
    all_performers = all_performers.rename(columns={'event': 'no. top 100'})
    all_performers['age_year_lsc'] = all_performers.age_group+' '+all_performers.year.astype(str)+' '+all_performers.lsc
    return all_performers

years = [i for i in range(2008, 2018)]
for i in range(len(years)-1):
    lsc_age(import_data(str(years[i+1])+"national100.csv",
                        str(years[i]), str(years[i+1])))
During the first call to your function lsc_age(), in the line
lsc_age = data_two.groupby(['year','age_group','lsc'])['event'].count().reset_index().sort_values(by=['age_group','event'],ascending=False)
you are overwriting your function object with a dataframe. This is happening since you imported the function object from the global namespace with
global lsc, lsc_age, top, all_performers
Functions in Python are objects. Please see more information about this here.
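To illustrate the failure mode with a standalone sketch (names are hypothetical, not from the question's data):
import pandas as pd

def lsc_age(df):
    return df.head()

lsc_age = pd.DataFrame({'a': [1]})  # the name now points to a DataFrame
lsc_age(pd.DataFrame())             # TypeError: 'DataFrame' object is not callable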
To solve your problem, try to avoid the global declarations. They do not seem to be necessary; try to pass your data around through the arguments of the function instead.
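For example, a minimal sketch of the fix (the local name lsc_age_df is hypothetical): drop the global declaration and bind the result to a name that does not shadow the function:
def lsc_age(data_two):
    # a local name instead of rebinding the function's own name via global
    lsc_age_df = (data_two.groupby(['year', 'age_group', 'lsc'])['event']
                          .count()
                          .reset_index()
                          .sort_values(by=['age_group', 'event'], ascending=False))
    return lsc_age_df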

Reducing RAM consumption of Python dict

I have a python script that processes several files of some gigabytes. With the code I show below, I store some data into a list, which is stored into a dictionary snp_dict. The RAM consumption is huge. Looking at my code, could you suggest some ways to reduce RAM consumption, if any?
def extractAF(files_vcf):
    z = 0
    snp_dict = dict()
    for infile_name in sorted(files_vcf):
        print ' * ' + infile_name
        ### single files
        vcf_reader = vcf.Reader(open(infile_name, 'r'))
        for record in vcf_reader:
            snp_position = '_'.join([record.CHROM, str(record.POS)])
            ref_F = float(record.INFO['DP4'][0])
            ref_R = float(record.INFO['DP4'][1])
            alt_F = float(record.INFO['DP4'][2])
            alt_R = float(record.INFO['DP4'][3])
            AF = (alt_F+alt_R)/(alt_F+alt_R+ref_F+ref_R)
            if not snp_position in snp_dict:
                snp_dict[snp_position] = list((0) for _ in range(len(files_vcf)))
            snp_dict[snp_position][z] = round(AF, 3)  # record.INFO['DP4']
        z += 1
    return snp_dict
I finally adopted the following implementation with MySQL:
for infile_name in sorted(files_vcf):
    print infile_name
    ### single files
    vcf_reader = vcf.Reader(open(infile_name, 'r'))
    for record in vcf_reader:
        snp_position = '_'.join([record.CHROM, str(record.POS)])
        ref_F = float(record.INFO['DP4'][0])
        ref_R = float(record.INFO['DP4'][1])
        alt_F = float(record.INFO['DP4'][2])
        alt_R = float(record.INFO['DP4'][3])
        AF = (alt_F+alt_R)/(alt_F+alt_R+ref_F+ref_R)
        if not snp_position in snp_dict:
            sql_insert_table = "INSERT INTO snps VALUES ('" + snp_position + "'," + ",".join(list(('0') for _ in range(len(files_vcf)))) + ")"
            cursor = db1.cursor()
            cursor.execute(sql_insert_table)
            db1.commit()
            snp_dict.append(snp_position)
        sql_update = "UPDATE snps SET " + str(z) + "g=" + str(AF) + " WHERE snp_pos='" + snp_position + "'"
        cursor = db1.cursor()
        cursor.execute(sql_update)
        db1.commit()
    z += 1
return snp_dict
For this sort of thing, you are probably better off using another data structure. A pandas DataFrame would work well in your situation.
The simplest solution would be to use an existing library, rather than writing your own parser. vcfnp can read vcf files into a format that is easily convertible to a pandas DataFrame. Something like this should work:
import numpy as np
import pandas as pd
import vcfnp

def extractAF(files_vcf):
    dfs = []
    for fname in sorted(files_vcf):
        vars = vcfnp.variants(fname, fields=['CHROM', 'POS', 'DP4'])
        snp_pos = np.char.add(np.char.add(vars.CHROM, '_'), vars.POS.astype('S'))
        dp4 = vars.DP4.astype('float')
        AF = dp4[2:].sum(axis=0)/dp4.sum(axis=0)
        dfs.append(pd.DataFrame(AF, index=snp_pos, columns=[fname]).T)
    return pd.concat(dfs).fillna(0.0)
If you absolutely must use PyVCF, it will be slower, but hopefully this will at least be faster than your existing implementation, and should produce the same result as the above code:
def extractAF(files_vcf):
    files_vcf = sorted(files_vcf)
    dfs = []
    for fname in files_vcf:
        print ' * ' + fname
        vcf_reader = vcf.Reader(open(fname, 'r'))
        vars = ((rec.CHROM, rec.POS) + tuple(rec.INFO['DP4']) for rec in vcf_reader)
        df = pd.DataFrame(vars, columns=['CHROMS', 'POS', 'ref_F', 'ref_R', 'alt_F', 'alt_R'])
        df['snp_position'] = df['CHROMS'] + '_' + df['POS'].astype('S')
        df_alt = df.loc[:, ('alt_F', 'alt_R')]
        df_dp4 = df.loc[:, ('alt_F', 'alt_R', 'ref_F', 'ref_R')]
        df[fname] = df_alt.sum(axis=1)/df_dp4.sum(axis=1)
        df = df.set_index('snp_position', drop=True).loc[:, fname:fname].T
        dfs.append(df)
    return pd.concat(dfs).fillna(0.0)
Now let's say you wanted to read a particular snp_position, say contained in a variable snp_pos, that may or may not be there (from your comment). You wouldn't actually have to change anything:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
    linea_di_AF = all_vcf[snp_pos]
The result will be slightly different, though. It will be a pandas Series, which is like an array but can also be accessed like a dictionary:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
    linea_di_AF = all_vcf[snp_pos]
    f_di_AF = linea_di_AF[files_vcf[0]]
This allows you to access a particular file/snp_pos pair directly:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
    f_di_AF = all_vcf[snp_pos][files_vcf[0]]
Or, better yet:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
    f_di_AF = all_vcf.loc[files_vcf[0], snp_pos]
Or you can get all snp_pos values for a given file:
all_vcf = extractAF(files_vcf)
fpos = all_vcf.loc[fname]
