I have some hectic task to do for which I need some help from python. Please see this word document.
I need to extract the text and GPS coordinates from each row. There are currently over 100 coordinates across 10 docx files. My "hefty" Python knowledge got me this far:
from docx import Document
import re
# Open the Word document and pick its second table (index 1).
# NOTE(review): the path literal on the next line is cut off (no closing quote) in this copy.
main_file = Document("D:/DOCUMENTS/Google_Link/1 Category I/1 Category
table = main_file.tables[1] #this is same for every document
data = []
keys = None
# Walk the table rows; the cell texts of row 0 are stored as `keys`.
for i, row in enumerate(table.rows):
text = (cell.text for cell in row.cells)
if i == 0:
keys = tuple(text)
row_data = tuple(text)
# NOTE(review): nothing visible here appends row_data to `data`, so `data` is still empty below - confirm against the original post.
# Pattern for the reference ids: "C", any character, "-", then word characters.
regexReference = re.compile("(C.-)\w+")
# Take cell index 1 of every collected row and keep those that match the pattern at their start.
colReference = [item[1] for item in data]
listReference = filter(regexReference.match, colReference)
# Python 2 print statement: emit each matching reference id as UTF-8 bytes.
for i in listReference:
print i.encode('UTF-8')
I can print 16 reference ids from column 2. Please guide me to print something like this.
some site, some region
The existing CMC Office at Bariyodhala (22°40'34.3"N; 91°38'28.2"E) requires
some repair/maintenance works including electrical wiring and electrical
lights and appliances like ceiling fans supplies. Detail specification of
the works are attached
x = 91°38'28.2"E
y = 22°40'34.3"N
These XY locations and descriptions will be used to create KML files afterwards, which will be attached to each document. I'd prefer a separate variable for each part of the above section (ref id, location, description, x and y) so that I can automate that step as well.
demo docx
I don't know if this works if there are files with different patterns (p.s. I'm using python 2.7.11):
# -*- coding: utf-8 -*-
from docx import Document
import sys
import os
import re
# Reference ids look like "C1-20701-17-1".
regexReference = re.compile("(C.-[0-9-]+)")
# Group 1 is the northing (latitude) part, group 4 the easting (longitude) part.
regexCoordinate = re.compile(r'(N-(.{,12})([0-9]|\')|[0-9].{,12}N)[;, ]+(E-(.{,12})([0-9]|\')|[0-9].{,12}E)')

# Order in which the extracted fields are printed for each row.
FIELD_ORDER = ('reference', 'location', 'description', 'x', 'y')


def parse_table_row(item):
    """Extract the reference id, texts and coordinates from one table row.

    Args:
        item: sequence of cell texts; index 1 holds the reference id,
            index 2 the description (which embeds the coordinates) and
            index 3 the location.

    Returns:
        A dict with 'reference', 'description' and 'location', plus 'x'
        (easting) and 'y' (northing) when coordinates are found, or None
        when the row is too short or carries no reference id.
    """
    if len(item) < 4:
        return None
    matchReference = regexReference.search(item[1])
    if not matchReference:
        return None
    description = item[2]
    tmp = {
        'reference': matchReference.group(),
        'description': description,
        'location': item[3],
    }
    matchCoordinate = regexCoordinate.search(description)
    if matchCoordinate:
        # x is the easting and y the northing, as requested in the question.
        tmp['x'] = matchCoordinate.group(4)
        tmp['y'] = matchCoordinate.group(1)
    return tmp


def read_table_rows(doc_file):
    """Return the data rows (header row excluded) of the second table in doc_file."""
    main_file = Document(doc_file)
    if len(main_file.tables) < 2:
        return []
    table = main_file.tables[1]  # this is same for every document
    rows = [tuple(cell.text for cell in row.cells) for row in table.rows]
    return rows[1:]  # row 0 only holds the column titles


def main(top="."):
    """Print the extracted fields for every .docx file found below top."""
    for root, dirs, files in os.walk(top):
        for name in files:
            # "~$" files are Word lock files, not real documents.
            if not name.lower().endswith('.docx') or name.startswith('~$'):
                continue
            doc_file = os.path.join(root, name)
            for item in read_table_rows(doc_file):
                rs = parse_table_row(item)
                if rs is None:
                    continue
                for k in FIELD_ORDER:
                    if k in rs:
                        print(u'{} = {}'.format(k, rs[k]))


if __name__ == '__main__':
    main()

# Output:
# --------------------------------
# reference = C1-20701-17-1
# location = xxxxx Site, c Region
# description = The existing CMC Office at Bariyodhala (22°40'34.3"N; 91°38'28.2"E) requires some repair/maintenance works including electrical wiring and electrical lights and appliances like ceiling fans supplies. Detail specification of the works are attached.
# x = 91°38'28.2"E
# y = 22°40'34.3"N
I have a csv file that is generated that has some information in the first line. I'm trying to skip it but it doesn't seem to work. I tried looking at several suggestions and examples.
I tried using skiprows.
I also looked at several other examples.
Pandas drop first columns after csv read
Nothing I tried worked the way I wanted it.
When I got it to work it deleted the entire row.
Here is a sample of the code
# Imports the Pandas Module. It must be installed to run this script.
import pandas as pd
# Number of leading lines to skip before the CSV header row. Set this to 1
# when the generated file starts with an extra information line.
SKIP_LEADING_LINES = 0


def summarize_print_log(dataframe):
    """Return one row per print job with color and duplex page counts.

    Args:
        dataframe: raw print log with the columns User, Pages, Copies,
            Grayscale, Duplex and Printer.

    Returns:
        A new DataFrame sorted by User, then Pages. 'Color' holds Pages for
        "NOT GRAYSCALE" jobs and 0 for "GRAYSCALE" jobs. 'Duplex' holds Pages
        for "DUPLEX" jobs and 0 for "NOT DUPLEX" jobs. Any other flag value
        is left unchanged.
    """
    df = pd.DataFrame({
        'User': dataframe['User'],
        'Pages': dataframe['Pages'],
        'Copies': dataframe['Copies'],
        # object dtype lets the text flags be replaced by page counts below
        'Color': dataframe['Grayscale'].astype(object),
        'Duplex': dataframe['Duplex'].astype(object),
        'Printer': dataframe['Printer'],
    })
    # Formats data so that it can be used to count Duplex and Color pages.
    is_duplex = dataframe['Duplex'] == 'DUPLEX'
    is_color = dataframe['Grayscale'] == 'NOT GRAYSCALE'
    df.loc[is_duplex, 'Duplex'] = dataframe.loc[is_duplex, 'Pages']
    df.loc[dataframe['Duplex'] == 'NOT DUPLEX', 'Duplex'] = 0
    df.loc[is_color, 'Color'] = dataframe.loc[is_color, 'Pages']
    df.loc[dataframe['Grayscale'] == 'GRAYSCALE', 'Color'] = 0
    # sort_values returns a new frame; it must be kept for the sort to apply.
    return df.sort_values(by=['User', 'Pages'])


if __name__ == '__main__':
    # Gets source file link
    source_file = 'Csvfile.csv'
    # Gets csv file and encodes it into a format that is compatible.
    dataframe = pd.read_csv(source_file, encoding='latin1',
                            skiprows=SKIP_LEADING_LINES)
    df = summarize_print_log(dataframe)
    df.to_csv('PrinterLogData.csv', index=False)
# Opens parsed CSV file.
output_source = "PrinterLogData.csv"
dataframe = pd.read_csv(output_source, encoding='latin1')
# Creates new DataFrame.
# NOTE(review): the dict literal below is cut off after 'Printer': in this copy
# (no value, no closing braces) - confirm against the original script.
df = pd.DataFrame({'User': dataframe.User, 'Pages': dataframe.Pages, 'Copies': dataframe.Copies,
'Color': dataframe.Color, 'Duplex': dataframe.Duplex, 'Printer':
# Groups data by Users and Printer Sums
# Report1 sums the columns per User, Report2 per Printer; both are ordered by
# Pages, largest first.
Report1 = df.groupby(['User'], as_index=False).sum().sort_values('Pages', ascending=False)
Report2 = (df.groupby(['Printer'], as_index=False).sum()).sort_values('Pages', ascending=False)
Sample Data
Sample Output of what I'm looking for.
This is an early draft of what you appear to want for your program (based on the simulated print-log.csv):
import csv
import itertools
import operator
import pathlib
CSV_FILE = pathlib.Path('print-log.csv')
EXTRA_COLUMNS = ['Pages', 'Grayscale', 'Color', 'Not Duplex', 'Duplex']


def main():
    """Load the print log and produce one report per grouping column.

    The first physical line of CSV_FILE is discarded; the line after it is
    used as the CSV header. A report is then written for 'Printer' and for
    'User', in that order.
    """
    with CSV_FILE.open('rt', newline='') as source:
        lines = iter(source)
        next(lines)  # skip first line if needed
        rows = [record for record in csv.DictReader(lines)]
    for column in ('Printer', 'User'):
        create_report(rows, column)
def create_report(table, column_name):
    """Write '<column_name> Report.csv' with page totals per distinct value.

    Args:
        table: list of row dicts as produced by csv.DictReader.
        column_name: name of the column to group by (e.g. 'User').

    The report has one row per distinct value of column_name, followed by the
    EXTRA_COLUMNS totals from analyze_group, ordered by 'Pages' descending.
    """
    group_key = operator.itemgetter(column_name)
    field_names = [column_name] + EXTRA_COLUMNS
    # itertools.groupby only merges *adjacent* rows, so the rows must be
    # sorted by the grouping key first or a value can appear several times.
    ordered = sorted(table, key=group_key)
    report = [
        {column_name: value} | analyze_group(group)
        for value, group in itertools.groupby(ordered, group_key)
    ]
    report.sort(key=operator.itemgetter('Pages'), reverse=True)
    with pathlib.Path(f'{column_name} Report').with_suffix('.csv').open(
        'wt', newline=''
    ) as file:
        writer = csv.DictWriter(file, field_names)
        writer.writeheader()
        writer.writerows(report)
def analyze_group(group):
    """Total the printed pages of a group of log rows.

    Each row contributes Pages * Copies to 'Pages', and the same amount to
    'Grayscale' or 'Color' and to 'Duplex' or 'Not Duplex', depending on the
    row's 'Grayscale' and 'Duplex' flags.

    Returns:
        A dict with one integer total per entry of EXTRA_COLUMNS.
    """
    totals = dict.fromkeys(EXTRA_COLUMNS, 0)
    for record in group:
        printed = int(record['Pages']) * int(record['Copies'])
        totals['Pages'] += printed

        tone = record['Grayscale']
        if tone == 'GRAYSCALE':
            totals['Grayscale'] += printed
        elif tone == 'NOT GRAYSCALE':
            totals['Color'] += printed

        sides = record['Duplex']
        if sides == 'NOT DUPLEX':
            totals['Not Duplex'] += printed
        elif sides == 'DUPLEX':
            totals['Duplex'] += printed
    return totals
# Run the report generation only when executed as a script.
if __name__ == '__main__':
    main()
I read a .xlsx file, update it but Im not able to save it
from xml.dom import minidom as md
[... some code ....]
# Path of the first worksheet inside the unpacked workbook directory.
sheet = workDir + '/xl/worksheets/sheet'
sheet1 = sheet + '1.xml'
# Read the XML text and close the handle right away: the same path is opened
# for writing later on, which must not compete with a handle left open here.
with open(sheet1, 'r') as importSheet1:
    whole_file = importSheet1.read()
# Parse into a DOM. Later edits change data_Sheet only, never whole_file.
data_Sheet = md.parseString(whole_file)
[... some code ....]
# Scan the worksheet rows starting at index 5. For rows whose first cell
# resolves (through self.array_shared) to `old`, overwrite the value of the
# cell at position `day` with the index of '--' in self.array_shared.
# NOTE(review): this is a fragment of a method; `day`, `old`, `len_array_shared`
# and `self` come from code that is not shown.
self.array_mem_name = []
y = 1
x = 5 #first useful row
day = int(day)
found = 0
# NOTE(review): no increment of x is visible in this copy of the loop - confirm.
while x <= len_array_shared:
readrow = data_Sheet.getElementsByTagName('row')[x]
c_data = readrow.getElementsByTagName('c')[0]
c_attrib = c_data.getAttribute('t')
# Only cells with t="s" are handled; their <v> text is used as an index into
# self.array_shared (presumably the shared-strings table - confirm).
if c_attrib == 's':
vName = c_data.getElementsByTagName('v')[0].firstChild.nodeValue
#if int(vName) != broken:
mem_name = self.array_shared[int(vName)]
if mem_name != '-----':
if mem_name == old:
c_data = readrow.getElementsByTagName('c')[day]
c_attrib = c_data.getAttribute('t')
if (c_attrib == 's'):
v_Attrib = c_data.getElementsByTagName('v')[0].firstChild.nodeValue
if v_Attrib != '':
#loc = self.array_shared[int(v_Attrib)]
index = self.array_shared.index('--')
# This changes the in-memory DOM only.
# NOTE(review): nodeValue is assigned an int here; presumably it should be a
# string (str(index)) for serialization - confirm.
c_data.getElementsByTagName('v')[0].firstChild.nodeValue = index
# NOTE(review): the body of this `with` is missing in this copy. According to
# the surrounding text it writes whole_file, i.e. the text read before the DOM
# was modified; presumably the serialized DOM (e.g. data_Sheet.toxml()) is what
# needs to be written - confirm.
with open(sheet1, 'w') as f:
As you can see, I call f.write(whole_file), but whole_file does not contain the change made with index.
While debugging I can see that the new value has been set on the node, but I can't save sheet1 with the modified value.
I switched to using openpyxl instead, as was suggested in a comment by Lei Yang. I found that this tool worked better for my jobs. With openpyxl, reading cell values is much easier than with xml.dom.minidom.
My only concern is that openpyxl seems considerably slower than the DOM approach at loading the workbook. Maybe the memory was overloaded. But I cared more about having something simpler than about this minor performance issue.
I'm a relative novice at python but yet, somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron',
2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with in the end is having 5 columns at the end of my .CSV outputting the X-th most commonly used value.
So something in the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
ts = time.gmtime()
# Return the list of CSV column names used for the output file.
# NOTE(review): `fieldnames` is not defined anywhere in the visible code and
# `top_numb` is unused here - the lines that build the list appear to be
# missing from this copy; confirm against the original script.
def get_csv_header(top_numb):
return fieldnames
def write_csv_header(filename, headers):
    """Create (or truncate) filename and write headers as its CSV header row.

    Args:
        filename: path of the CSV file to create.
        headers: sequence of column names, in output order.
    """
    # utf-8 matches the encoding used when the data rows are appended later.
    with open(filename, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        # Constructing the writer writes nothing; emit the header explicitly.
        writer.writeheader()
def read_user_name(t_file):
    """Return the usernames listed in t_file, one per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing empty line does not turn into an empty username.
    """
    with open(t_file) as f:
        stripped = (line.strip() for line in f.read().splitlines())
        user_list = [name for name in stripped if name]
    return user_list
# Script entry point: read the usernames, fetch each profile page, derive
# engagement figures and hashtag counts from the embedded JSON, and open the
# CSV file in append mode for each user.
# NOTE(review): indentation and several lines (a `try:`, the `continue`s after
# the "Skipping..." messages, the final row write) are missing in this copy.
if __name__ == '__main__':
# Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
USER_FILE = 'userlist.txt'
# HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
DATA_FILE = 'users_with_er.csv'
print('Starting the engagement calculations... Please wait until it finishes!')
users = read_user_name(USER_FILE)
""" Writing data to csv file """
# NOTE(review): MAX_POST is not defined in the visible code.
csv_headers = get_csv_header(MAX_POST)
write_csv_header(DATA_FILE, csv_headers)
for user in users:
post_info = {'USER': user}
url = 'https://www.instagram.com/' + user + '/'
#for troubleshooting, un-comment the next two lines:
r = requests.get(url)
# NOTE(review): `timestamp` is used here but only assigned further down.
if r.status_code != 200:
print(timestamp,' user {0} not found or page unavailable! Skipping...'.format(user))
# Locate the <script> holding window._sharedData and parse its JSON payload.
soup = BeautifulSoup(r.content, "html.parser")
scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
# [:-1] drops the last character of the script text (presumably a trailing ';' - confirm).
stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
# `ts` is computed once at import time, so every message carries the same time.
timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
# NOTE(review): these handlers have no visible matching `try:`.
except ValueError:
print(timestamp,'ValueError for username {0}...Skipping...'.format(user))
except IndexError as error:
# Output expected IndexErrors.
print(timestamp, error)
# Report profiles with no followers, fewer than 12 posts, or a private profile.
if j['graphql']['user']['edge_followed_by']['count'] <=0:
print(timestamp,'user {0} has no followers! Skipping...'.format(user))
if j['graphql']['user']['edge_owner_to_timeline_media']['count'] <12:
print(timestamp,'user {0} has less than 12 posts! Skipping...'.format(user))
if j['graphql']['user']['is_private'] is True:
print(timestamp,'user {0} has a private profile! Skipping...'.format(user))
media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
accountname = j['graphql']['user']['username']
followercount = j['graphql']['user']['edge_followed_by']['count']
bio = j['graphql']['user']['biography']
i = 0
total_likes = 0
total_comments = 0
all_captiontext = ''
# Accumulate likes, comments and caption text over timeline edges 0..11.
while i <= 11:
total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
# NOTE(review): indexing [0] presumably fails for a post without a caption - confirm.
caption_detail = captions['edges'][0]['node']['text']
all_captiontext += caption_detail
i += 1
# Engagement rate: (likes + comments) per follower, averaged over 12 posts, as a percentage.
engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount)/12)*100) + '%'
engagement_rate = (((total_likes + total_comments) / followercount)/12*100)
#isolate and count hashtags
hashtags = re.findall(r'#\w*', all_captiontext)
hashtags_counted = Counter(hashtags)
# List of (hashtag, count) tuples for the five most frequent hashtags.
most_common = hashtags_counted.most_common(5)
with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
print(timestamp,'Writing Data for user {0}...'.format(user))
post_info["USER"] = accountname
post_info["FOLLOWERCOUNT"] = followercount
post_info["MEDIA COUNT"] = media_count
post_info["TOTAL LIKES"] = total_likes
post_info["TOTAL COMMENTS"] = total_comments
post_info["ER"] = engagement_rate
post_info["ER IN %"] = engagement_rate_percentage
post_info["BIO"] = bio
post_info["ALL CAPTION TEXT"] = all_captiontext
# NOTE(review): this stores the whole Counter; `most_common` computed above is
# not used anywhere in the visible code.
post_info["HASHTAGS COUNTED"] = hashtags_counted
# NOTE(review): no writerow(post_info) call is visible after the writer is created.
csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
""" Done with the script """
print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace the line below with the loop that follows it:
post_info["MOST COMMON HASHTAGS"] = most_common
# Store each of the most common hashtags, without its '#', under its own
# "Top N" key (N starts at 1) so that it gets its own CSV column.
for rank, (hashtag, _occurrences) in enumerate(most_common, start=1):
    post_info["Top %d" % rank] = hashtag.replace('#','')
There's also a bit of code missing. For example, your code doesn't include csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you would like to do is to collect the results into a list of dictionaries. A cleaner solution would be to use pandas' dataframe, which you can output straight into a csv file.
most_common being the output of the call to hashtags_counted.most_common, I had a look at the doc here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as [(key, value), (key, value), ...] and is ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, replace the first line below with the second:
post_info["MOST COMMON HASHTAGS"] = most_common
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This expression builds, on the fly, a list containing the first element of each tuple, preserving the sort order.
I have extracted the data using python Selenium from the site below.
Please have a look at the table "Metrics Comparison for Your Design and/or Target".
I have extracted the table as a text format.
Here is the sample output of the text below
Metric Design Project Design Target Median Property*
ENERGY STAR score (1-100) Not Available 75 50
Source EUI (kBtu/ft²) 3.1 Not Available 127.9
Site EUI (kBtu/ft²) 1.0 Not Available 40.7
Source Energy Use (kBtu) 314.0 Not Available 12,793.0
Site Energy Use (kBtu) 100.0 Not Available 4,074.2
Energy Cost ($) 2,000.00 Not Available 81,484.00
Total GHG Emissions (Metric Tons CO2e) 0.0 Not Available 0.5
I tried to convert the text into json,
import csv
import json
# Read the space-separated text dump and try to turn each line into a dict
# keyed by the header words (Python 2 script: 'rb' mode, print statement).
with open('file.txt', 'rb') as csvfile:
filereader = csv.reader(csvfile, delimiter=' ')
i = 0
header = []
out_data = []
for row in filereader:
# Drop the empty strings produced by runs of consecutive spaces.
row = [elem for elem in row if elem]
# The first line becomes the header, one key per word.
if i == 0:
i += 1
header = row
# Merge the first four words of the row into a single cell.
row[0:4] = [row[0]+" "+row[1]+" "+row[2]+" "+row[3]]
_dict = {}
# zip stops at the shorter sequence, so surplus header words or cells are dropped.
for elem, header_elem in zip(row, header):
_dict[header_elem] = elem
# NOTE(review): nothing visible appends _dict to out_data - confirm against the original post.
print json.dumps(out_data)
The JSON format output which i got was like
[{"Project": "75", "Metric": "ENERGY STAR score (1-100)", "Design": "50"}]
The JSON format output should be in the form of
[{"Design Project": "Not Available", "Design Target": "75", "Metric": "ENERGY STAR score (1-100)", "Median Property*": "50"}]
You forgot to build the header and the data for the other JSON keys (Design
Project, Design Target, etc.).
This is correct version:
import csv
import json
# Two-word placeholder that stands for a single missing value.
NOT_AVAILABLE = ['Not', 'Available']


def parse_header(line):
    """Split the header line into 'Metric' plus two-word column titles."""
    words = line.split()
    return words[:1] + [' '.join(words[i:i + 2]) for i in range(1, len(words), 2)]


def parse_metric_row(line, header):
    """Turn one data line into a dict keyed by header.

    The value columns are read from the right: each is either the two words
    "Not Available" or a single token. Whatever remains on the left is the
    metric name, so names of any length are handled.
    """
    words = line.split()
    values = []
    for _ in header[1:]:
        if words[-2:] == NOT_AVAILABLE:
            values.append(' '.join(words[-2:]))
            del words[-2:]
        elif words:
            values.append(words.pop())
        else:
            values.append('')  # line is shorter than the header
    values.reverse()
    return dict(zip(header, [' '.join(words)] + values))


def parse_metrics(lines):
    """Parse the table text (header line first) into a list of row dicts."""
    rows = [line for line in lines if line.strip()]
    if not rows:
        return []
    header = parse_header(rows[0])
    return [parse_metric_row(line, header) for line in rows[1:]]


if __name__ == '__main__':
    import io

    # The units contain non-ASCII characters, so decode explicitly.
    with io.open('test.txt', 'r', encoding='utf-8') as csvfile:  # Opens file
        out_data = parse_metrics(csvfile)
    print(json.dumps(out_data))
It works only if the structure of your data is constant and each key/value consists of the same number of words.
You can add additional conditions (such as the length check I added in my code):
if len(row) == 5: # check conditions for better parse
row[1:3] = [row[1]+" "+row[2]] # Design Project value
Edit : I know feature.type will give gene/CDS and feature.qualifiers will then give "db_xref"/"locus_tag"/"inference" etc. Is there a feature. object which will allow me to access the location (eg: [5240:7267](+) ) directly?
This URL give a bit more info, though I can't figure out how to use it for my purpose... http://biopython.org/DIST/docs/api/Bio.SeqFeature.SeqFeature-class.html#location_operator
Original Post:
I am trying to modify the location of features within a GenBank file. Essentially, I want to modify the following bit of a GenBank file:
gene 5240..7267
CDS 5240..7267
/inference="protein motif:PROSITE:PS00177"
gene 5357..7267
CDS 5357..7267
/inference="protein motif:PROSITE:PS00177"
Note the changes from 5240 to 5357
So far, from scouring the internet and Stackoverflow, I have:
from Bio import SeqIO
# Parse the GenBank file and, for the gene feature with the given locus tag on
# the forward strand, record the intended new position in a qualifier.
gb_file = "mtbtomod.gb"
gb_record = SeqIO.parse(open(gb_file, "r+"), "genbank")
# Locus tag of the gene to amend and its new start coordinate.
rvnumber = 'Rv0005'
newstart = 5357
final_features = []
for record in gb_record:
for feature in record.features:
if feature.type == "gene":
if feature.qualifiers["locus_tag"][0] == rvnumber:
if feature.location.strand == 1:
# Only adds an "amend_position" qualifier; feature.location is not changed.
feature.qualifiers["amend_position"] = "%s:%s" % (newstart, feature.location.end+1)
# do the reverse for the complementary strand
# NOTE(review): final_features is never appended to in the visible code, so
# this assignment would leave the record without features - confirm.
record.features = final_features
with open("testest.gb","w") as testest:
SeqIO.write(record, testest, "genbank")
This basically creates a new qualifier called "amend_position".. however, what I would like to do is modify the location directly (with or without creating a new file...)
Rv0005 is just an example of a locus_tag I need to update. I have about 600 more locations to update, which explains the need for a script.. Help!
Ok, I have something which now fully works. I'll post the code in case anyone ever needs something similar
__author__ = 'Kavin'
from Bio import SeqIO
from Bio import SeqFeature
import xlrd
import sys
import re
def build_tss(data, score_cutoff=-0.7):
    """Map locus tag -> new start codon data for significantly re-annotated genes.

    Args:
        data: rows of spreadsheet cell values; column 0 is the gene, column 3
            the orientation, column 4 the re-annotated start and column 5 the
            BASS score.
        score_cutoff: only rows whose BASS score is below this are kept.

    Returns:
        Dict of {gene: {'Direction': str, 'StartCodon': int}}.
    """
    tss = {}
    for entry in data:
        try:
            score = float(entry[5])
            start_codon = int(entry[4])
        except (IndexError, TypeError, ValueError):
            # Header, blank or short rows carry no numeric score/start: skip
            # them instead of relying on fixed first/last row offsets.
            continue
        if score < score_cutoff:  # Ensures BASS score is within significant range
            tss[str(entry[0])] = {
                'Direction': str(entry[3]),
                'StartCodon': start_codon,
            }
    return tss


def modified_filename(path):
    """Return path with '_modified' inserted before its extension."""
    import os.path

    root, extension = os.path.splitext(path)
    return root + '_modified' + extension


if __name__ == '__main__':
    workbook = xlrd.open_workbook(sys.argv[2])
    sheet = workbook.sheet_by_index(0)
    data = [[sheet.cell_value(r, c) for c in range(sheet.ncols)] for r in range(sheet.nrows)]
    # For each entry (row), store the startcodon and strand information
    TSS = build_tss(data)
    # Create an output filename based on input filename
    outfile = modified_filename(sys.argv[1])
def relocate_feature(feature, newstart):
    """Move the start codon end of feature to the 1-based position newstart.

    On the forward strand the start codon is the left edge, so the 0-based
    location start becomes newstart - 1 and the end is kept. On the reverse
    strand the start codon is the right edge, so the location end becomes
    newstart and the start is kept.
    NOTE(review): a compound (joined) location is replaced by a single span.
    """
    old_location = feature.location
    if old_location.strand == 1:
        feature.location = SeqFeature.FeatureLocation(
            SeqFeature.ExactPosition(newstart - 1),
            old_location.end,
            old_location.strand)
    else:
        feature.location = SeqFeature.FeatureLocation(
            old_location.start,
            SeqFeature.ExactPosition(newstart),
            old_location.strand)


if __name__ == '__main__':
    updated_records = []
    with open(sys.argv[1], "r") as gb_in:
        for record in SeqIO.parse(gb_in, "genbank"):
            for feature in record.features:
                if feature.type not in ("gene", "CDS"):
                    continue
                # Not every gene/CDS feature is guaranteed to carry a locus_tag.
                locus_tag = feature.qualifiers.get("locus_tag", [None])[0]
                if locus_tag in TSS:
                    relocate_feature(feature, TSS[locus_tag]['StartCodon'])
            updated_records.append(record)
    # Write every record in one go; reopening the file with "w" for each
    # record would keep only the last one.
    with open(outfile, "w") as new_gb:
        SeqIO.write(updated_records, new_gb, "genbank")
This assumes usage such as python program.py <genbankfile> <excel spreadsheet>
This also assumes a spreadsheet of the following format:
Gene Synonym Tuberculist_annotated_start Orientation Re-annotated_start BASS_score
Rv0005 gyrB 5240 + 5357 -1.782
Rv0012 Rv0012 14089 + 14134 -1.553
Rv0018c pstP 23181 - 23172 -2.077
Rv0032 bioF2 34295 + 34307 -0.842
Rv0037c Rv0037c 41202 - 41163 -0.554
So, you can try something like the code below. The number of changes will be equal to the number of CDS/gene pairs found in the file. You can read the locations/positions from a csv/text file and build a list like the change_values list I created by hand.
import re
def replace_feature_starts(lines, change_values):
    """Return lines with the start coordinate of each gene/CDS pair replaced.

    Feature lines are expected in gene/CDS pairs; both lines of the n-th pair
    receive change_values[n] as their new start. Only the first number on the
    line is replaced, so the end coordinate is kept. All other lines, and
    feature lines beyond the supplied values, are returned unchanged.
    """
    result = []
    features_seen = 0
    for line in lines:
        fields = line.split()
        is_feature = (
            len(fields) == 2
            and fields[0] in ('gene', 'CDS')
            and re.search(r"\d", fields[1]) is not None
        )
        if is_feature:
            pair_index = features_seen // 2
            features_seen += 1
            if pair_index < len(change_values):
                line = re.sub(r"\d+", str(change_values[pair_index]), line, count=1)
        result.append(line)
    return result


if __name__ == '__main__':
    import sys

    change_values = ["1111", "2222"]
    with open("test.txt") as f:
        # Instead of printing, write the lines to the output file.
        sys.stdout.writelines(replace_feature_starts(f, change_values))
My sample test.txt file looks like this:
gene 5240..7267
CDS 5240..7267
/inference="protein motif:PROSITE:PS00177"
gene 5240..7267
CDS 5240..7267
/inference="protein motif:PROSITE:PS00177"
Hope, this will solve your issue. Cheers!
I think this can be done with native Biopython syntax, no regex
needed; a minimal working example is here:
from Bio import SeqIO
from Bio import SeqFeature
import copy
# Load the single-record GenBank file.
gbk = SeqIO.read('./test_gbk', 'gb')
index = 1
feature_to_change = copy.deepcopy(gbk.features[index]) #depends what feature you want to change,
#can also be done with loop if you want to change them all, or by some function...
new_start = 0
new_end = 100
# The strand is taken from the copied feature itself.
new_feature_location = SeqFeature.FeatureLocation(new_start, new_end, feature_to_change.location.strand) #create a new feature location object
feature_to_change.location = new_feature_location #change old feature location
del gbk.features[index] #remove changed feature
gbk.features.append(feature_to_change) #add identical feature with new location
gbk.features = sorted(gbk.features, key = lambda feature: feature.location.start) # if you want them sorted by the start of the location, which is the usual case
SeqIO.write(gbk, './test_gbk_with_new_feature', 'gb')