Related
username = request.json['username']
output = subprocess.check_output("python3 sherlock.py "+username, shell=True)
rso = str(output).split(sep='Input')[0]
buffer = []
for item in output:
tmp = str(item).split("\\n")
tmp2 = {
tmp[0].replace((': ','":"')):tmp[1].replace(" ", "")
}
buffer.append(tmp2)
print(rso)
return jsonify({"msg":str(buffer)})
this is the result that I have to convert:
{
"msg": "b'[*] Checking username aylinmari_ on:\\n[+] CapFriendly: https://www.capfriendly.com/users/aylinmari_\\n[+] Codecademy: https://www.codecademy.com/profiles/aylinmari_\\n[+] Coil: https://coil.com/u/aylinmari_\\n[+] Facenama: https://facenama.com/aylinmari_\\n[+] Fiverr: https://www.fiverr.com/aylinmari_\\n"
}
This is sherlock.py:
import csv
import os
import platform
import re
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from time import monotonic
import requests
from requests_futures.sessions import FuturesSession
from torrequest import TorRequest
from result import QueryStatus
from result import QueryResult
from notify import QueryNotifyPrint
from sites import SitesInformation
module_name = "Sherlock: Find Usernames Across Social Networks"
__version__ = "0.14.0"
class SherlockFuturesSession(FuturesSession):
def request(self, method, url, hooks={}, *args, **kwargs):
"""Request URL.
This extends the FuturesSession request method to calculate a response
time metric to each request.
It is taken (almost) directly from the following StackOverflow answer:
https://github.com/ross/requests-futures#working-in-the-background
Keyword Arguments:
self -- This object.
method -- String containing method desired for request.
url -- String containing URL for request.
hooks -- Dictionary containing hooks to execute after
request finishes.
args -- Arguments.
kwargs -- Keyword arguments.
Return Value:
Request object.
"""
# Record the start time for the request.
start = monotonic()
def response_time(resp, *args, **kwargs):
"""Response Time Hook.
Keyword Arguments:
resp -- Response object.
args -- Arguments.
kwargs -- Keyword arguments.
Return Value:
N/A
"""
resp.elapsed = monotonic() - start
return
# Install hook to execute when response completes.
# Make sure that the time measurement hook is first, so we will not
# track any later hook's execution time.
try:
if isinstance(hooks['response'], list):
hooks['response'].insert(0, response_time)
elif isinstance(hooks['response'], tuple):
# Convert tuple to list and insert time measurement hook first.
hooks['response'] = list(hooks['response'])
hooks['response'].insert(0, response_time)
else:
# Must have previously contained a single hook function,
# so convert to list.
hooks['response'] = [response_time, hooks['response']]
except KeyError:
# No response hook was already defined, so install it ourselves.
hooks['response'] = [response_time]
return super(SherlockFuturesSession, self).request(method,
url,
hooks=hooks,
*args, **kwargs)
def get_response(request_future, error_type, social_network):
# Default for Response object if some failure occurs.
response = None
error_context = "General Unknown Error"
expection_text = None
try:
response = request_future.result()
if response.status_code:
# Status code exists in response object
error_context = None
except requests.exceptions.HTTPError as errh:
error_context = "HTTP Error"
expection_text = str(errh)
except requests.exceptions.ProxyError as errp:
error_context = "Proxy Error"
expection_text = str(errp)
except requests.exceptions.ConnectionError as errc:
error_context = "Error Connecting"
expection_text = str(errc)
except requests.exceptions.Timeout as errt:
error_context = "Timeout Error"
expection_text = str(errt)
except requests.exceptions.RequestException as err:
error_context = "Unknown Error"
expection_text = str(err)
return response, error_context, expection_text
def sherlock(username, site_data, query_notify,
tor=False, unique_tor=False,
proxy=None, timeout=None):
"""Run Sherlock Analysis.
Checks for existence of username on various social media sites.
Keyword Arguments:
username -- String indicating username that report
should be created against.
site_data -- Dictionary containing all of the site data.
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
tor -- Boolean indicating whether to use a tor circuit for the requests.
unique_tor -- Boolean indicating whether to use a new tor circuit for each request.
proxy -- String indicating the proxy URL
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
Return Value:
Dictionary containing results from report. Key of dictionary is the name
of the social network site, and the value is another dictionary with
the following keys:
url_main: URL of main site.
url_user: URL of user on site (if account exists).
status: QueryResult() object indicating results of test for
account existence.
http_status: HTTP status code of query which checked for existence on
site.
response_text: Text that came back from request. May be None if
there was an HTTP error when checking for existence.
"""
# Notify caller that we are starting the query.
query_notify.start(username)
# Create session based on request methodology
if tor or unique_tor:
# Requests using Tor obfuscation
underlying_request = TorRequest()
underlying_session = underlying_request.session
else:
# Normal requests
underlying_session = requests.session()
underlying_request = requests.Request()
# Limit number of workers to 20.
# This is probably vastly overkill.
if len(site_data) >= 20:
max_workers=20
else:
max_workers=len(site_data)
# Create multi-threaded session for all requests.
session = SherlockFuturesSession(max_workers=max_workers,
session=underlying_session)
# Results from analysis of all sites
results_total = {}
# First create futures for all requests. This allows for the requests to run in parallel
for social_network, net_info in site_data.items():
# Results from analysis of this specific site
results_site = {}
# Record URL of main site
results_site['url_main'] = net_info.get("urlMain")
# A user agent is needed because some sites don't return the correct
# information since they think that we are bots (Which we actually are...)
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}
if "headers" in net_info:
# Override/append any extra headers required by a given site.
headers.update(net_info["headers"])
# URL of user on site (if it exists)
url = net_info["url"].format(username)
# Don't make request if username is invalid for the site
regex_check = net_info.get("regexCheck")
if regex_check and re.search(regex_check, username) is None:
# No need to do the check at the site: this user name is not allowed.
results_site['status'] = QueryResult(username,
social_network,
url,
QueryStatus.ILLEGAL)
results_site["url_user"] = ""
results_site['http_status'] = ""
results_site['response_text'] = ""
query_notify.update(results_site['status'])
else:
# URL of user on site (if it exists)
results_site["url_user"] = url
url_probe = net_info.get("urlProbe")
if url_probe is None:
# Probe URL is normal one seen by people out on the web.
url_probe = url
else:
# There is a special URL for probing existence separate
# from where the user profile normally can be found.
url_probe = url_probe.format(username)
if (net_info["errorType"] == 'status_code' and
net_info.get("request_head_only", True) == True):
# In most cases when we are detecting by status code,
# it is not necessary to get the entire body: we can
# detect fine with just the HEAD response.
request_method = session.head
else:
# Either this detect method needs the content associated
# with the GET response, or this specific website will
# not respond properly unless we request the whole page.
request_method = session.get
if net_info["errorType"] == "response_url":
# Site forwards request to a different URL if username not
# found. Disallow the redirect so we can capture the
# http status from the original URL request.
allow_redirects = False
else:
# Allow whatever redirect that the site wants to do.
# The final result of the request will be what is available.
allow_redirects = True
# This future starts running the request in a new thread, doesn't block the main thread
if proxy is not None:
proxies = {"http": proxy, "https": proxy}
future = request_method(url=url_probe, headers=headers,
proxies=proxies,
allow_redirects=allow_redirects,
timeout=timeout
)
else:
future = request_method(url=url_probe, headers=headers,
allow_redirects=allow_redirects,
timeout=timeout
)
# Store future in data for access later
net_info["request_future"] = future
# Reset identify for tor (if needed)
if unique_tor:
underlying_request.reset_identity()
# Add this site's results into final dictionary with all of the other results.
results_total[social_network] = results_site
# Open the file containing account links
# Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses
for social_network, net_info in site_data.items():
# Retrieve results again
results_site = results_total.get(social_network)
# Retrieve other site information again
url = results_site.get("url_user")
status = results_site.get("status")
if status is not None:
# We have already determined the user doesn't exist here
continue
# Get the expected error type
error_type = net_info["errorType"]
# Retrieve future and ensure it has finished
future = net_info["request_future"]
r, error_text, expection_text = get_response(request_future=future,
error_type=error_type,
social_network=social_network)
# Get response time for response of our request.
try:
response_time = r.elapsed
except AttributeError:
response_time = None
# Attempt to get request information
try:
http_status = r.status_code
except:
http_status = "?"
try:
response_text = r.text.encode(r.encoding)
except:
response_text = ""
if error_text is not None:
result = QueryResult(username,
social_network,
url,
QueryStatus.UNKNOWN,
query_time=response_time,
context=error_text)
elif error_type == "message":
# error_flag True denotes no error found in the HTML
# error_flag False denotes error found in the HTML
error_flag = True
errors=net_info.get("errorMsg")
# errors will hold the error message
# it can be string or list
# by insinstance method we can detect that
# and handle the case for strings as normal procedure
# and if its list we can iterate the errors
if isinstance(errors,str):
# Checks if the error message is in the HTML
# if error is present we will set flag to False
if errors in r.text:
error_flag = False
else:
# If it's list, it will iterate all the error message
for error in errors:
if error in r.text:
error_flag = False
break
if error_flag:
result = QueryResult(username,
social_network,
url,
QueryStatus.CLAIMED,
query_time=response_time)
else:
result = QueryResult(username,
social_network,
url,
QueryStatus.AVAILABLE,
query_time=response_time)
elif error_type == "status_code":
# Checks if the status code of the response is 2XX
if not r.status_code >= 300 or r.status_code < 200:
result = QueryResult(username,
social_network,
url,
QueryStatus.CLAIMED,
query_time=response_time)
else:
result = QueryResult(username,
social_network,
url,
QueryStatus.AVAILABLE,
query_time=response_time)
elif error_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= r.status_code < 300:
result = QueryResult(username,
social_network,
url,
QueryStatus.CLAIMED,
query_time=response_time)
else:
result = QueryResult(username,
social_network,
url,
QueryStatus.AVAILABLE,
query_time=response_time)
else:
# It should be impossible to ever get here...
raise ValueError(f"Unknown Error Type '{error_type}' for "
f"site '{social_network}'")
# Notify caller about results of query.
query_notify.update(result)
# Save status of request
results_site['status'] = result
# Save results from request
results_site['http_status'] = http_status
results_site['response_text'] = response_text
# Add this site's results into final dictionary with all of the other results.
results_total[social_network] = results_site
# Notify caller that all queries are finished.
query_notify.finish()
return results_total
def timeout_check(value):
"""Check Timeout Argument.
Checks timeout for validity.
Keyword Arguments:
value -- Time in seconds to wait before timing out request.
Return Value:
Floating point number representing the time (in seconds) that should be
used for the timeout.
NOTE: Will raise an exception if the timeout in invalid.
"""
from argparse import ArgumentTypeError
try:
timeout = float(value)
except:
raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
if timeout <= 0:
raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
return timeout
def main():
version_string = f"%(prog)s {__version__}\n" + \
f"{requests.__description__}: {requests.__version__}\n" + \
f"Python: {platform.python_version()}"
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
description=f"{module_name} (Version {__version__})"
)
parser.add_argument("--version",
action="version", version=version_string,
help="Display version information and dependencies."
)
parser.add_argument("--verbose", "-v", "-d", "--debug",
action="store_true", dest="verbose", default=False,
help="Display extra debugging information and metrics."
)
parser.add_argument("--folderoutput", "-fo", dest="folderoutput",
help="If using multiple usernames, the output of the results will be saved to this folder."
)
parser.add_argument("--output", "-o", dest="output",
help="If using single username, the output of the result will be saved to this file."
)
parser.add_argument("--tor", "-t",
action="store_true", dest="tor", default=False,
help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.")
parser.add_argument("--unique-tor", "-u",
action="store_true", dest="unique_tor", default=False,
help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.")
parser.add_argument("--csv",
action="store_true", dest="csv", default=False,
help="Create Comma-Separated Values (CSV) File."
)
parser.add_argument("--site",
action="append", metavar='SITE_NAME',
dest="site_list", default=None,
help="Limit analysis to just the listed sites. Add multiple options to specify more than one site."
)
parser.add_argument("--proxy", "-p", metavar='PROXY_URL',
action="store", dest="proxy", default=None,
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080"
)
parser.add_argument("--json", "-j", metavar="JSON_FILE",
dest="json_file", default=None,
help="Load data from a JSON file or an online, valid, JSON file.")
parser.add_argument("--timeout",
action="store", metavar='TIMEOUT',
dest="timeout", type=timeout_check, default=None,
help="Time (in seconds) to wait for response to requests. "
"Default timeout is infinity. "
"A longer timeout will be more likely to get results from slow sites. "
"On the other hand, this may cause a long delay to gather all results."
)
parser.add_argument("--print-all",
action="store_true", dest="print_all",
help="Output sites where the username was not found."
)
parser.add_argument("--print-found",
action="store_false", dest="print_all", default=False,
help="Output sites where the username was found."
)
parser.add_argument("--no-color",
action="store_true", dest="no_color", default=False,
help="Don't color terminal output"
)
parser.add_argument("username",
nargs='+', metavar='USERNAMES',
action="store",
help="One or more usernames to check with social networks."
)
parser.add_argument("--browse", "-b",
action="store_true", dest="browse", default=False,
help="Browse to all results on default browser.")
parser.add_argument("--local", "-l",
action="store_true", default=False,
help="Force the use of the local data.json file.")
args = parser.parse_args()
# Check for newer version of Sherlock. If it exists, let the user know about it
try:
r = requests.get("https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock/sherlock.py")
remote_version = str(re.findall('__version__ = "(.*)"', r.text)[0])
local_version = __version__
if remote_version != local_version:
print("Update Available!\n" +
f"You are running version {local_version}. Version {remote_version} is available at https://git.io/sherlock")
except Exception as error:
print(f"A problem occured while checking for an update: {error}")
# Argument check
# TODO regex check on args.proxy
if args.tor and (args.proxy is not None):
raise Exception("Tor and Proxy cannot be set at the same time.")
# Make prompts
if args.proxy is not None:
print("Using the proxy: " + args.proxy)
if args.tor or args.unique_tor:
print("Using Tor to make requests")
print("Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors.")
# Check if both output methods are entered as input.
if args.output is not None and args.folderoutput is not None:
print("You can only use one of the output methods.")
sys.exit(1)
# Check validity for single username output.
if args.output is not None and len(args.username) != 1:
print("You can only use --output with a single username")
sys.exit(1)
# Create object with all information about sites we are aware of.
try:
if args.local:
sites = SitesInformation(os.path.join(os.path.dirname(__file__), 'resources/data.json'))
else:
sites = SitesInformation(args.json_file)
except Exception as error:
print(f"ERROR: {error}")
sys.exit(1)
# Create original dictionary from SitesInformation() object.
# Eventually, the rest of the code will be updated to use the new object
# directly, but this will glue the two pieces together.
site_data_all = {}
for site in sites:
site_data_all[site.name] = site.information
if args.site_list is None:
# Not desired to look at a sub-set of sites
site_data = site_data_all
else:
# User desires to selectively run queries on a sub-set of the site list.
# Make sure that the sites are supported & build up pruned site database.
site_data = {}
site_missing = []
for site in args.site_list:
counter = 0
for existing_site in site_data_all:
if site.lower() == existing_site.lower():
site_data[existing_site] = site_data_all[existing_site]
counter += 1
if counter == 0:
# Build up list of sites not supported for future error message.
site_missing.append(f"'{site}'")
if site_missing:
print(f"Error: Desired sites not found: {', '.join(site_missing)}.")
if not site_data:
sys.exit(1)
# Create notify object for query results.
query_notify = QueryNotifyPrint(result=None,
verbose=args.verbose,
print_all=args.print_all,
color=not args.no_color)
# Run report on all specified users.
for username in args.username:
results = sherlock(username,
site_data,
query_notify,
tor=args.tor,
unique_tor=args.unique_tor,
proxy=args.proxy,
timeout=args.timeout)
if args.output:
result_file = args.output
elif args.folderoutput:
# The usernames results should be stored in a targeted folder.
# If the folder doesn't exist, create it first
os.makedirs(args.folderoutput, exist_ok=True)
result_file = os.path.join(args.folderoutput, f"{username}.txt")
else:
result_file = f"{username}.txt"
with open(result_file, "w", encoding="utf-8") as file:
exists_counter = 0
for website_name in results:
dictionary = results[website_name]
if dictionary.get("status").status == QueryStatus.CLAIMED:
exists_counter += 1
file.write(dictionary["url_user"] + "\n")
file.write(f"Total Websites Username Detected On : {exists_counter}\n")
if args.csv:
result_file = f"{username}.csv"
if args.folderoutput:
# The usernames results should be stored in a targeted folder.
# If the folder doesn't exist, create it first
os.makedirs(args.folderoutput, exist_ok=True)
result_file = os.path.join(args.folderoutput, result_file)
with open(result_file, "w", newline='', encoding="utf-8") as csv_report:
writer = csv.writer(csv_report)
writer.writerow(['username',
'name',
'url_main',
'url_user',
'exists',
'http_status',
'response_time_s'
]
)
for site in results:
response_time_s = results[site]['status'].query_time
if response_time_s is None:
response_time_s = ""
writer.writerow([username,
site,
results[site]['url_main'],
results[site]['url_user'],
str(results[site]['status'].status),
results[site]['http_status'],
response_time_s
]
)
print()
if __name__ == "__main__":
main()
From looking at a sample output of sherlock.py at https://asciinema.org/a/223115, I don't see why you're doing any splitting. You should just search for [*] Checking username and get everything from there, to skip over the picture of Sherlock.
import re
username = request.json['username']
output = subprocess.check_output("python3 sherlock.py "+username, shell=True)
lines = re.sub(r'^.*?(?=\[\*\] Checking username)', '', flags=re.DOTALL)
return jsonify({"msg": lines})
I'm looking at parsing out just the most recent reply/message from an email thread as part of a zap.
I've found this link but how to I use it within a Zap? https://github.com/zapier/email-reply-parser
i.e. when I pick up a thread from gmail how do I just extract the most recent message?
Is this possible in Code by Zapier and if so how?
E.g.
Input:
Yes that is fine, I will email you in the morning.
On Fri, Nov 16, 2012 at 1:48 PM, Zapier wrote:
Our support team just commented on your open Ticket:
"Hi Royce, can we chat in the morning about your question?"
Ouput: i.e. the parsed email:
Yes that is fine, I will email you in the morning.
First off: it's not possible to use that in a code step directly. Python code steps don't have access to external packages.
That said, that package is just Python code, and there's nothing stopping you copying all of the important code into the Code step and using it that way.
It's worth noting that the linked code is pretty old and looks to be unmaintained, so it's unlikely to work without modifications.
I had a go at adapting this https://github.com/zapier/email-reply-parser which seemed to work as well.
"""
email_reply_parser is a python library port of GitHub's Email Reply Parser.
For more information, visit https://github.com/zapier/email-reply-parser
"""
import re
class EmailReplyParser(object):
""" Represents a email message that is parsed.
"""
#staticmethod
def read(text):
""" Factory method that splits email into list of fragments
text - A string email body
Returns an EmailMessage instance
"""
return EmailMessage(text).read()
#staticmethod
def parse_reply(text):
""" Provides the reply portion of email.
text - A string email body
Returns reply body message
"""
return EmailReplyParser.read(text).reply
class EmailMessage(object):
""" An email message represents a parsed email body.
"""
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
def __init__(self, text):
self.fragments = []
self.fragment = None
self.text = text.replace('\r\n', '\n')
self.found_visible = False
def read(self):
""" Creates new fragment for each line
and labels as a signature, quote, or hidden.
Returns EmailMessage instance
"""
self.found_visible = False
is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text)
if is_multi_quote_header:
self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text)
# Fix any outlook style replies, with the reply immediately above the signature boundary line
# See email_2_2.txt for an example
self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE)
self.lines = self.text.split('\n')
self.lines.reverse()
for line in self.lines:
self._scan_line(line)
self._finish_fragment()
self.fragments.reverse()
return self
#property
def reply(self):
""" Captures reply message within email
"""
reply = []
for f in self.fragments:
if not (f.hidden or f.quoted):
reply.append(f.content)
return '\n'.join(reply)
def _scan_line(self, line):
""" Reviews each line in email message and determines fragment type
line - a row of text from an email message
"""
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
is_quoted = self.QUOTED_REGEX.match(line) is not None
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
if self.fragment and len(line.strip()) == 0:
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
self.fragment.signature = True
self._finish_fragment()
if self.fragment \
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
self.fragment.lines.append(line)
else:
self._finish_fragment()
self.fragment = Fragment(is_quoted, line, headers=is_header)
def quote_header(self, line):
""" Determines whether line is part of a quoted area
line - a row of the email message
Returns True or False
"""
return self.QUOTE_HDR_REGEX.match(line[::-1]) is not None
def _finish_fragment(self):
""" Creates fragment
"""
if self.fragment:
self.fragment.finish()
if self.fragment.headers:
# Regardless of what's been seen to this point, if we encounter a headers fragment,
# all the previous fragments should be marked hidden and found_visible set to False.
self.found_visible = False
for f in self.fragments:
f.hidden = True
if not self.found_visible:
if self.fragment.quoted \
or self.fragment.headers \
or self.fragment.signature \
or (len(self.fragment.content.strip()) == 0):
self.fragment.hidden = True
else:
self.found_visible = True
self.fragments.append(self.fragment)
self.fragment = None
class Fragment(object):
""" A Fragment is a part of
an Email Message, labeling each part.
"""
def __init__(self, quoted, first_line, headers=False):
self.signature = False
self.headers = headers
self.hidden = False
self.quoted = quoted
self._content = None
self.lines = [first_line]
def finish(self):
""" Creates block of content with lines
belonging to fragment.
"""
self.lines.reverse()
self._content = '\n'.join(self.lines)
self.lines = None
#property
def content(self):
return self._content.strip()
return {'emailstring': EmailReplyParser.parse_reply(input_data['body'])}
I use the blogger2wordpress python script that Google released back in 2010 (https://code.google.com/archive/p/google-blog-converters-appengine/downloads), to convert a 95mb blogger export file to wordpress wxr format.
However, the script has this code:
#!/usr/bin/env python
# Copyright 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
import logging
import re
import sys
import time
from xml.sax.saxutils import unescape
import BeautifulSoup
import gdata
from gdata import atom
import iso8601
import wordpress
__author__ = 'JJ Lueck (EMAIL#gmail.com)'
###########################
# Constants
###########################
BLOGGER_URL = 'http://www.blogger.com/'
BLOGGER_NS = 'http://www.blogger.com/atom/ns#'
KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
YOUTUBE_RE = re.compile('http://www.youtube.com/v/([^&]+)&?.*')
YOUTUBE_FMT = r'[youtube=http://www.youtube.com/watch?v=\1]'
GOOGLEVIDEO_RE = re.compile('(http://video.google.com/googleplayer.swf.*)')
GOOGLEVIDEO_FMT = r'[googlevideo=\1]'
DAILYMOTION_RE = re.compile('http://www.dailymotion.com/swf/(.*)')
DAILYMOTION_FMT = r'[dailymotion id=\1]'
###########################
# Translation class
###########################
class Blogger2Wordpress(object):
"""Performs the translation of a Blogger export document to WordPress WXR."""
def __init__(self, doc):
"""Constructs a translator for a Blogger export file.
Args:
doc: The WXR file as a string
"""
# Ensure UTF8 chars get through correctly by ensuring we have a
# compliant UTF8 input doc.
self.doc = doc.decode('utf-8', 'replace').encode('utf-8')
# Read the incoming document as a GData Atom feed.
self.feed = atom.FeedFromString(self.doc)
self.next_id = 1
def Translate(self):
"""Performs the actual translation to WordPress WXR export format.
Returns:
A WordPress WXR export document as a string, or None on error.
"""
# Create the top-level document and the channel associated with it.
channel = wordpress.Channel(
title = self.feed.title.text,
link = self.feed.GetAlternateLink().href,
base_blog_url = self.feed.GetAlternateLink().href,
pubDate = self._ConvertPubDate(self.feed.updated.text))
posts_map = {}
for entry in self.feed.entry:
# Grab the information about the entry kind
entry_kind = ""
for category in entry.category:
if category.scheme == KIND_SCHEME:
entry_kind = category.term
if entry_kind.endswith("#comment"):
# This entry will be a comment, grab the post that it goes to
in_reply_to = entry.FindExtensions('in-reply-to')
post_item = None
# Check to see that the comment has a corresponding post entry
if in_reply_to:
post_id = self._ParsePostId(in_reply_to[0].attributes['ref'])
post_item = posts_map.get(post_id, None)
# Found the post for the comment, add the commment to it
if post_item:
# The author email may not be included in the file
author_email = ''
if entry.author[0].email:
author_email = entry.author[0].email.text
# Same for the the author's url
author_url = ''
if entry.author[0].uri:
author_url = entry.author[0].uri.text
post_item.comments.append(wordpress.Comment(
comment_id = self._GetNextId(),
author = entry.author[0].name.text,
author_email = author_email,
author_url = author_url,
date = self._ConvertDate(entry.published.text),
content = self._ConvertContent(entry.content.text)))
elif entry_kind.endswith('#post'):
# This entry will be a post
post_item = self._ConvertEntry(entry, False)
posts_map[self._ParsePostId(entry.id.text)] = post_item
channel.items.append(post_item)
elif entry_kind.endswith('#page'):
# This entry will be a static page
page_item = self._ConvertEntry(entry, True)
posts_map[self._ParsePageId(entry.id.text)] = page_item
channel.items.append(page_item)
wxr = wordpress.WordPressWxr(channel=channel)
return wxr.WriteXml()
def _ConvertEntry(self, entry, is_page):
"""Converts the contents of an Atom entry into a WXR post Item element."""
# A post may have an empty title, in which case the text element is None.
title = ''
if entry.title.text:
title = entry.title.text
# Check here to see if the entry points to a draft or regular post
status = 'publish'
if entry.control and entry.control.draft:
status = 'draft'
# If no link is present in the Blogger entry, just link
if entry.GetAlternateLink():
link = entry.GetAlternateLink().href
else:
link = BLOGGER_URL
# Declare whether this is a post of a page
post_type = 'post'
if is_page:
post_type = 'page'
blogger_blog = ''
blogger_permalink = ''
if entry.GetAlternateLink():
blogger_path_full = entry.GetAlternateLink().href.replace('http://', '')
blogger_blog = blogger_path_full.split('/')[0]
blogger_permalink = blogger_path_full[len(blogger_blog):]
# Create the actual item element
post_item = wordpress.Item(
title = title,
link = link,
pubDate = self._ConvertPubDate(entry.published.text),
creator = entry.author[0].name.text,
content = self._ConvertContent(entry.content.text),
post_id = self._GetNextId(),
post_date = self._ConvertDate(entry.published.text),
status = status,
post_type = post_type,
blogger_blog = blogger_blog,
blogger_permalink = blogger_permalink,
blogger_author = entry.author[0].name.text)
# Convert the categories which specify labels into wordpress labels
for category in entry.category:
if category.scheme == BLOGGER_NS:
post_item.labels.append(category.term)
return post_item
def _ConvertContent(self, text):
"""Unescapes the post/comment text body and replaces video content.
All <object> and <embed> tags in the post that relate to video must be
changed into the WordPress tags for embedding video,
e.g. [youtube=http://www.youtube.com/...]
If no text is provided, the empty string is returned.
"""
if not text:
return ''
# First unescape all XML tags as they'll be escaped by the XML emitter
content = unescape(text)
# Use an HTML parser on the body to look for video content
content_tree = BeautifulSoup.BeautifulSoup(content)
# Find the object tag
objs = content_tree.findAll('object')
for obj_tag in objs:
# Find the param tag within which contains the URL to the movie
param_tag = obj_tag.find('param', { 'name': 'movie' })
if not param_tag:
continue
# Get the video URL
video = param_tag.attrMap.get('value', None)
if not video:
continue
# Convert the video URL if necessary
video = YOUTUBE_RE.subn(YOUTUBE_FMT, video)[0]
video = GOOGLEVIDEO_RE.subn(GOOGLEVIDEO_FMT, video)[0]
video = DAILYMOTION_RE.subn(DAILYMOTION_FMT, video)[0]
# Replace the portion of the contents with the video
obj_tag.replaceWith(video)
return str(content_tree)
def _ConvertPubDate(self, date):
"""Translates to a pubDate element's time/date format."""
date_tuple = iso8601.parse_date(date)
return date_tuple.strftime('%a, %d %b %Y %H:%M:%S %z')
def _ConvertDate(self, date):
"""Translates to a wordpress date element's time/date format."""
date_tuple = iso8601.parse_date(date)
return date_tuple.strftime('%Y-%m-%d %H:%M:%S')
def _GetNextId(self):
"""Returns the next identifier to use in the export document as a string."""
next_id = self.next_id;
self.next_id += 1
return str(next_id)
def _ParsePostId(self, text):
"""Extracts the post identifier from a Blogger entry ID."""
matcher = re.compile('post-(\d+)')
matches = matcher.search(text)
return matches.group(1)
def _ParsePageId(self, text):
"""Extracts the page identifier from a Blogger entry ID."""
matcher = re.compile('page-(\d+)')
matches = matcher.search(text)
return matches.group(1)
if __name__ == '__main__':
if len(sys.argv) <= 1:
print 'Usage: %s <blogger_export_file>' % os.path.basename(sys.argv[0])
print
print ' Outputs the converted WordPress export file to standard out.'
sys.exit(-1)
wp_xml_file = open(sys.argv[1])
wp_xml_doc = wp_xml_file.read()
translator = Blogger2Wordpress(wp_xml_doc)
print translator.Translate()
wp_xml_file.close()
This scripts outputs the wxr file in the terminal window which is useless for me when the import file has tons of entries.
As I am not familiar with python, how can I modify the script to output the data into a .xml file?
Edit:
I did changed the end of the script to:
wp_xml_file = open(sys.argv[1])
wp_xml_doc = wp_xml_file.read()
translator = Blogger2Wordpress(wp_xml_doc)
print translator.Translate()
fh = open("testoutput.xml", "w")
fh.write(wp_xml_doc);
fh.close();
wp_xml_file.close()
But the produced file is an "invalid wxr file" :/
Can anybody help? Thanks!
Quick and dirty answer:
Output to the stdout is normal behaviour.
You might want to redirect it to a file for instance:
python2 blogger2wordpress your_blogger_export_file > backup
The output will be saved in the file named backup.
Or you can replace print translator.Translate() by
with open('output_file', 'w') as fd:
fd.write(translator.Translate())
This should do the trick (haven't tried).
I have a script here with some basic funtions:
Function 1 - wget, opens a webpage and saves it to a local variable then closes.
Function 2 - scrapes this webpage for md5 hash values.
Function 3 - takes the hash values and cracks them using a dictionary of commonly used passwords.
My problem is getting my output from Function 2 and inserting it into Function 3. This is partly due to the output from Function 2 being a list and Function 3 is looking for just hash values.
You guys will most likely be able to understand more from reading my code, below is my code so far.
import sys, hashlib, re, urllib
def wget(url): # could import webpage_get and use wget() from there instead
'''Read the contents of a webpage from a specified URL'''
print '[+]---------------------------------------------------------------------------- ' #CHANGE THIS
# open URL
webpage = urllib.urlopen(url) # opens url like a file
# get page contents
page_contents = webpage.read() # reads content of webpage
return page_contents
page_contents = webpage.close() # close webpage
def findmd5(text):
'''Find all md5 hash values'''
md5value = re.findall(r'([a-fA-F\d]{32})', text)
count = len(md5value)
print "[+] Total number of md5 hash values found: %s" % count
for x in md5value:
print x
def dict_attack(passwd_hash):
dic = ['123','1234','12345','123456','1234567','12345678','password','qwerty','abc','abcd','abc123','111111','monkey','arsenal','letmein','trustno1','dragon','baseball','superman','iloveyou','starwars','montypython','cheese','123123','football','password','batman']
passwd_found = False
for value in dic:
hashvalue = hashlib.md5(value).hexdigest()
if hashvalue == passwd_hash:
passwd_found = True
recovered_password = value
if passwd_found == True:
print '[+] Password recovered: %s'% (recovered_password)
else:
print '[-] Password not recovered'
def main():
# temp testing url argument
sys.argv.append('URL HERE!')
# Check args
if len(sys.argv) != 2:
print '[-] Usage: email_analysis URL/filename'
return
#call functions
try:
print '[+] md5 values found: '
print findmd5(wget(sys.argv[1]))
print '[+] Cracking hash values: '
except IOError:
print 'Error'
if __name__ == '__main__':
main()
Any help is greatly appreciated!
wget: Set return statement as last statement.
findmd5: Changed from printing it's results, to returning them to a variable in main.
main: added in for loop to iterate over found hashes and apply dict_attack to each value.
I did however not build in any break or stop condition, so even if found, the program will continue running. It will however still print the found result.
import sys, hashlib, re, urllib
def wget(url): # could import webpage_get and use wget() from there instead
'''Read the contents of a webpage from a specified URL'''
print ('[+]---------------------------------------------------------------------------- ') #CHANGE THIS
# open URL
webpage = urllib.urlopen(url) # opens url like a file
# get page contents
page_contents = webpage.read() # reads content of webpage
page_contents = webpage.close() # close webpage
return page_contents
def findmd5(text):
'''Find all md5 hash values'''
md5value = re.findall(r'([a-fA-F\d]{32})', text)
count = len(md5value)
print ("[+] Total number of md5 hash values found: %s" % count)
return md5value
def dict_attack(passwd_hash):
dic = ['123','1234','12345','123456','1234567','12345678','password','qwerty','abc','abcd','abc123','111111','monkey','arsenal','letmein','trustno1','dragon','baseball','superman','iloveyou','starwars','montypython','cheese','123123','football','password','batman']
passwd_found = False
for value in dic:
hashvalue = hashlib.md5(value).hexdigest()
if hashvalue == passwd_hash:
passwd_found = True
recovered_password = value
if passwd_found == True:
print ('[+] Password recovered: %s'% (recovered_password))
else:
print ('[-] Password not recovered')
def main():
# temp testing url argument
sys.argv.append('URL HERE!')
# Check args
if len(sys.argv) != 2:
print ('[-] Usage: email_analysis URL/filename')
return
#call functions
try:
md5Values = findmd5(wget(sys.argv[1]))
for md5value in md5values:
dict_attack(md5value)
print ('[+] Cracking hash values: ')
except IOError:
print ('Error')
if __name__ == '__main__':
main()
Billy, return a a list of the hashes found, instead of printing them (it looks like you are thinking like it was bash, but you don't need to "print" the output of a function, as you did in bash, in python you can literally return an array with the elements found).
Your regexp for the hash uses \d, but that includes - as well, it might bring something that is not a MD5 hash.
I'd like to identify a method to attain the Worksheet ID within the URL for each of the worksheets within a Google Spreadsheet Workbook. For example, the worksheet id for 'sheet2' of this workbook is '1244369280' , since it's url is https://docs.google.com/spreadsheets/d/1yd8qTYjRns4_OT8PbsZzH0zajvzguKS79dq6j--hnTs/edit#gid=1244369280
One method I've found is to pull the XML of a Google Spreadsheet, since according to this question, the only way to get the Worksheet ID is to stream down the XML of a worksheet, but the example is in Javascript and I need to do this in Python
This is the Javascript Code that I'd like to execute in Python:
Dim worksheetFeed As WorksheetFeed
Dim query As WorksheetQuery
Dim worksheet As WorksheetEntry
Dim output As New MemoryStream
Dim xml As String
Dim gid As String = String.Empty
Try
_service = New Spreadsheets.SpreadsheetsService("ServiceName")
_service.setUserCredentials(UserId, Password)
query = New WorksheetQuery(feedUrl)
worksheetFeed = _service.Query(query)
worksheet = worksheetFeed.Entries(0)
' Save worksheet feed to memory stream so we can
' get the xml returned from the feed url and look for
' the gid. Gid allows us to download the specific worksheet tab
Using output
worksheet.SaveToXml(output)
End Using
xml = Encoding.ASCII.GetString(output.ToArray())
It seems that the best way to get the XML from a Google Spreadsheet is using Gdata, so I've downloaded GData and tried the Google Spreadsheet example with my credentials.
See below
#!/usr/bin/python
#
# Copyright (C) 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = 'api.laurabeth#gmail.com (Laura Beth Lincoln)'
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import gdata.spreadsheet.service
import gdata.service
import atom.service
import gdata.spreadsheet
import atom
import getopt
import sys
import string
class SimpleCRUD:
def __init__(self, email, password):
self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
self.gd_client.email = 'chris#curalate.com'
self.gd_client.password = 'jkjkdioerzumawya'
self.gd_client.source = 'Spreadsheets GData Sample'
self.gd_client.ProgrammaticLogin()
self.curr_key = ''
self.curr_wksht_id = ''
self.list_feed = None
def _PromptForSpreadsheet(self):
# Get the list of spreadsheets
feed = self.gd_client.GetSpreadsheetsFeed()
self._PrintFeed(feed)
input = raw_input('\nSelection: ')
id_parts = feed.entry[string.atoi(input)].id.text.split('/')
self.curr_key = id_parts[len(id_parts) - 1]
def _PromptForWorksheet(self):
# Get the list of worksheets
feed = self.gd_client.GetWorksheetsFeed(self.curr_key)
self._PrintFeed(feed)
input = raw_input('\nSelection: ')
id_parts = feed.entry[string.atoi(input)].id.text.split('/')
self.curr_wksht_id = id_parts[len(id_parts) - 1]
def _PromptForCellsAction(self):
print ('dump\n'
'update {row} {col} {input_value}\n'
'\n')
input = raw_input('Command: ')
command = input.split(' ', 1)
if command[0] == 'dump':
self._CellsGetAction()
elif command[0] == 'update':
parsed = command[1].split(' ', 2)
if len(parsed) == 3:
self._CellsUpdateAction(parsed[0], parsed[1], parsed[2])
else:
self._CellsUpdateAction(parsed[0], parsed[1], '')
else:
self._InvalidCommandError(input)
def _PromptForListAction(self):
print ('dump\n'
'insert {row_data} (example: insert label=content)\n'
'update {row_index} {row_data}\n'
'delete {row_index}\n'
'Note: No uppercase letters in column names!\n'
'\n')
input = raw_input('Command: ')
command = input.split(' ' , 1)
if command[0] == 'dump':
self._ListGetAction()
elif command[0] == 'insert':
self._ListInsertAction(command[1])
elif command[0] == 'update':
parsed = command[1].split(' ', 1)
self._ListUpdateAction(parsed[0], parsed[1])
elif command[0] == 'delete':
self._ListDeleteAction(command[1])
else:
self._InvalidCommandError(input)
def _CellsGetAction(self):
# Get the feed of cells
feed = self.gd_client.GetCellsFeed(self.curr_key, self.curr_wksht_id)
self._PrintFeed(feed)
def _CellsUpdateAction(self, row, col, inputValue):
entry = self.gd_client.UpdateCell(row=row, col=col, inputValue=inputValue,
key=self.curr_key, wksht_id=self.curr_wksht_id)
if isinstance(entry, gdata.spreadsheet.SpreadsheetsCell):
print 'Updated!'
def _ListGetAction(self):
# Get the list feed
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
self._PrintFeed(self.list_feed)
def _ListInsertAction(self, row_data):
entry = self.gd_client.InsertRow(self._StringToDictionary(row_data),
self.curr_key, self.curr_wksht_id)
if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
print 'Inserted!'
def _ListUpdateAction(self, index, row_data):
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
entry = self.gd_client.UpdateRow(
self.list_feed.entry[string.atoi(index)],
self._StringToDictionary(row_data))
if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
print 'Updated!'
def _ListDeleteAction(self, index):
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
self.gd_client.DeleteRow(self.list_feed.entry[string.atoi(index)])
print 'Deleted!'
def _StringToDictionary(self, row_data):
dict = {}
for param in row_data.split():
temp = param.split('=')
dict[temp[0]] = temp[1]
return dict
def _PrintFeed(self, feed):
for i, entry in enumerate(feed.entry):
if isinstance(feed, gdata.spreadsheet.SpreadsheetsCellsFeed):
print '%s %s\n' % (entry.title.text, entry.content.text)
elif isinstance(feed, gdata.spreadsheet.SpreadsheetsListFeed):
print '%s %s %s' % (i, entry.title.text, entry.content.text)
# Print this row's value for each column (the custom dictionary is
# built using the gsx: elements in the entry.)
print 'Contents:'
for key in entry.custom:
print ' %s: %s' % (key, entry.custom[key].text)
print '\n',
else:
print '%s %s\n' % (i, entry.title.text)
def _InvalidCommandError(self, input):
print 'Invalid input: %s\n' % (input)
def Run(self):
self._PromptForSpreadsheet()
self._PromptForWorksheet()
input = raw_input('cells or list? ')
if input == 'cells':
while True:
self._PromptForCellsAction()
elif input == 'list':
while True:
self._PromptForListAction()
def main():
# parse command line options
try:
opts, args = getopt.getopt(sys.argv[1:], "", ["user=", "pw="])
except getopt.error, msg:
print 'python spreadsheetExample.py --user [username] --pw [password] '
sys.exit(2)
user = 'fake#gmail.com'
pw = 'fakepassword'
key = ''
# Process options
for o, a in opts:
if o == "--user":
user = a
elif o == "--pw":
pw = a
if user == '' or pw == '':
print 'python spreadsheetExample.py --user [username] --pw [password] '
sys.exit(2)
sample = SimpleCRUD(user, pw)
sample.Run()
if __name__ == '__main__':
main()
However this returns the following error:
Traceback (most recent call last):
File "/Users/Chris/Desktop/gdata_test.py", line 200, in <module>
main()
File "/Users/Chris/Desktop/gdata_test.py", line 196, in main
sample.Run()
File "/Users/Chris/Desktop/gdata_test.py", line 162, in Run
self._PromptForSpreadsheet()
File "/Users/Chris/Desktop/gdata_test.py", line 49, in _PromptForSpreadsheet
feed = self.gd_client.GetSpreadsheetsFeed()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/service.py", line 99, in GetSpreadsheetsFeed
converter=gdata.spreadsheet.SpreadsheetsSpreadsheetsFeedFromString)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/service.py", line 1074, in Get
return converter(result_body)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/__init__.py", line 395, in SpreadsheetsSpreadsheetsFeedFromString
xml_string)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 93, in optional_warn_function
return f(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 127, in CreateClassFromXMLString
tree = ElementTree.fromstring(xml_string)
File "<string>", line 125, in XML
cElementTree.ParseError: no element found: line 1, column 0
[Finished in 0.3s with exit code 1]
[shell_cmd: python -u "/Users/Chris/Desktop/gdata_test.py"]
[dir: /Users/Chris/Desktop]
[path: /usr/bin:/bin:/usr/sbin:/sbin]
I should also mention that I've been using Gspread as a method to interact with Google Spreadsheets, but when I run the below code, I get the gid, but I need to have the worksheet id.
gc = gspread.authorize(credentials)
sh = gc.open_by_url('google_spreadsheet_url')
sh.get_id_fields()
>> {'spreadsheet_id': '1BgCEn-3Nor7UxOEPwD-qv8qXe7CaveJBrn9_Lcpo4W4','worksheet_id': 'oqitk0d'}
See the self.gd_client.ProgrammaticLogin() call - this is causing the major problem since it uses the "ClientLogin" authorization method which was first deprecated and later removed on April 20, 2015.
I would actually look into the more fresh and actively developed gspread module instead.
Here is a, somewhat insane, example demonstrating how to extract the actual "gid" value for a given spreadsheet and worksheet name. Note that you would first need to generate the JSON file with the OAuth credentials (I'm assuming you've already done that).
The code (added comments that would hopefully help to understand it):
import urlparse
import xml.etree.ElementTree as ET
import gspread
from oauth2client.service_account import ServiceAccountCredentials
SPREADSHEET_NAME = 'My Test Spreadsheet'
WORKSHEET_NAME = "Sheet2"
PATH_TO_JSON_KEYFILE = '/path/to/json/key/file.json'
NAMESPACES = {'ns0': 'http://www.w3.org/2005/Atom'}
SCOPES = ['https://spreadsheets.google.com/feeds']
# log in
credentials = ServiceAccountCredentials.from_json_keyfile_name(PATH_TO_JSON_KEYFILE, SCOPES)
gss_client = gspread.authorize(credentials)
# open spreadsheet
gss = gss_client.open(SPREADSHEET_NAME)
# extract the full feed url
root = gss._feed_entry
full_feed_url = next(elm.attrib["href"] for elm in root.findall("ns0:link", namespaces=NAMESPACES) if "full" in elm.attrib["href"])
# get the feed and extract the gid value for a given sheet name
response = gss_client.session.get(full_feed_url)
root = ET.fromstring(response.content)
sheet_entry = next(elm for elm in root.findall("ns0:entry", namespaces=NAMESPACES)
if elm.find("ns0:title", namespaces=NAMESPACES).text == WORKSHEET_NAME)
link = next(elm.attrib["href"] for elm in sheet_entry.findall("ns0:link", namespaces=NAMESPACES)
if "gid=" in elm.attrib["href"])
# extract "gid" from URL
gid = urlparse.parse_qs(urlparse.urlparse(link).query)["gid"][0]
print(gid)
It also looks like there is a way to convert the worksheet ID to a gid value, see:
How to convert Google spreadsheet's worksheet string id to integer index (GID)?
Jan 2017
You can use the new google spreadsheet api v4. You could take look at pygsheets library which uses api v4.
import pygsheets
#authorize the pygsheets
gc = pygsheets.authorize()
#open the spreadsheet
sh = gc.open('my new ssheet')
# get the worksheet and its id
print sh.worksheet_by_title("my test sheet").id
this seems to work for me using gspread
given a spreadsheet's worksheet url named 'mysheet1' that looks like this:
https://docs.google.com/spreadsheets/d/xxxxxf435454xxkjkjk23232325/edit#gid=645031900
this could be use to retrieve the gid value (aka: worksheet id or sheetid)
ss_key = xxxxxf435454xxkjkjk23232325
wks_name = mysheet1
gc.open_by_key('xxxxxf435454xxkjkjk23232325').worksheet('mysheet1').id
result:
645031900