How to connect data scraping with Google Sheets - Python

I have this code from a web-scraping script:
from smart_sensor_client.smart_sensor_client import SmartSensorClient
import json
import requests
import pprint
DEFAULT_SETTINGS_FILE = 'settings.yaml'
def run_task(settings_file=DEFAULT_SETTINGS_FILE) -> bool:
    """Authenticate against the Smart Sensor API and print each asset's last sync time.

    Args:
        settings_file: Path of the settings file handed to ``SmartSensorClient``.

    Returns:
        ``False`` when authentication fails. The remainder of the function
        (including its success return) is elided in the original post.
    """
    client = SmartSensorClient(settings_file=settings_file)

    # Nothing else can be queried without a valid session.
    if not client.authenticate():
        print('Authentication FAILED')
        return False

    # One asset query per plant; the query itself is keyed on the organization.
    for _plant in client.get_plant_list():
        assets = client.get_asset_list(organization_id=client.organization_id)
        if len(assets) == 0:
            print('No assets in this plant')
            continue
        for asset in assets:
            print(asset["assetName"], ':', asset["lastSyncTimeStamp"])
...
And this is the JSON response, filtered down to the information I am interested in:
Ventilador FCH 11 : 2020-06-11T09:48:45Z
VTL SVA1 : 2020-06-11T10:20:43Z
Dardelet PV-1 : 2020-06-11T09:58:14Z
CANDLOT 1 (MOTOR N°2) : 2020-06-11T10:37:39Z
PC N°1 S1/2A : 2020-06-11T10:57:34Z
VTL SVA2 : 2020-06-11T11:31:08Z
Ventilador FCH 6 : 2020-06-11T11:43:28Z
Vibrotamiz Tolva Tampon : 2020-06-11T11:44:43Z
Ventilador FCH 10 : 2020-06-11T11:08:03Z
Task SUCCESS
I would like to paste this into a Google Sheet, but I don't know how, or whether it is even possible. Thank you!

Related

Running Google Cloud DocumentAI sample code on Python returned the error 503

I am trying the example from the Google repo:
https://github.com/googleapis/python-documentai/blob/HEAD/samples/snippets/quickstart_sample.py
I have an error:
metadata=[('x-goog-request-params', 'name=projects/my_proj_id/locations/us/processors/my_processor_id'), ('x-goog-api-client', 'gl-python/3.8.10 grpc/1.38.1 gax/1.30.0 gapic/1.0.0')]), last exception: 503 DNS resolution failed for service: https://us-documentai.googleapis.com/v1/
My full code:
from google.cloud import documentai_v1 as documentai
import os
# TODO(developer): Uncomment these variables before running the sample.
project_id= '123456789'
location = 'us' # Format is 'us' or 'eu'
processor_id = '1a23345gh823892' # Create processor in Cloud Console
file_path = 'document.jpg'
# NOTE(review): forces gRPC onto its 'native' DNS resolver. The reply below the
# question suspects this override is what produces the
# "503 DNS resolution failed" error -- confirm by running with it commented out.
os.environ['GRPC_DNS_RESOLVER'] = 'native'
def quickstart(project_id: str, location: str, processor_id: str, file_path: str):
    """Send one JPEG to a Document AI processor and print every paragraph found.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location, 'us' or 'eu'.
        processor_id: ID of a processor already created in the Cloud Console.
        file_path: Path of the image to process; it is sent as "image/jpeg".
    """
    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = {}
    if location == "eu":
        opts = {"api_endpoint": "eu-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # The ":process" suffix shown in the REST URL is the method verb, not part
    # of the resource name, so it must not be appended here.
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    raw_document = {"content": image_content, "mime_type": "image/jpeg"}

    # Configure the process request
    request = {"name": name, "raw_document": raw_document}
    result = client.process_document(request=request)
    document = result.document

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document.pages:
        for paragraph in page.paragraphs:
            print(paragraph)
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")
def get_text(doc_element: dict, document: dict):
    """Return the document text covered by a layout element.

    Document AI identifies form fields by their offsets in the document
    text; this function converts those offsets to a text snippet.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index``. Despite the
            ``dict`` annotation it is read through attribute access.
        document: Object exposing the full text as ``document.text``.

    Returns:
        The concatenation of every referenced slice, or "" when the element
        has no segments.
    """
    # A text span that covers several lines is stored as several segments,
    # so the pieces are joined in order. The previous membership test on each
    # segment was always true and has been dropped.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
def main():
    """Run the quickstart with the module-level sample settings."""
    quickstart(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        file_path=file_path,
    )


if __name__ == '__main__':
    main()
FYI, on the Google Cloud website it stated that the endpoint is:
https://us-documentai.googleapis.com/v1/projects/123456789/locations/us/processors/1a23345gh823892:process
I can use the web interface to run DocumentAI so it is working. I just have the problem with Python code.
Any suggestion is appreciated.
I would suspect the GRPC_DNS_RESOLVER environment variable to be the root cause. Did you try with the corresponding line commented out? Why was it added in your code?

How to get all customers data from Shopify Python API?

For a private Shopify app, I want to retrieve all the customers data and write into a csv file. I have tried the option below for getting page-wise 250 records at a time. But I am getting an error:
HTTPError: Bad Request
shopify.ShopifyResource.set_site(shop_url)
import sys
import pandas as pd
# Get all customers
def get_all_resources(resource, **kwargs):
    """Fetch every record of a Shopify resource using cursor-based pagination.

    Shopify rejects the old ``page=N`` parameter with "Bad Request", so the
    first page is requested normally and later pages are reached through the
    collection's own next-page cursor.

    Args:
        resource: Shopify resource class, e.g. ``shopify.Customer``.
        **kwargs: Extra query parameters forwarded to ``resource.find``.
            ``limit`` defaults to 250 when not supplied.

    Returns:
        A list with the records from all pages, in the order received.
    """
    # Work on a copy so the caller's keyword dict is never modified.
    params = dict(kwargs)
    params.setdefault("limit", 250)

    page = resource.find(**params)
    resources = list(page)
    # A plain list (no cursor support) simply ends after the first page.
    while hasattr(page, "has_next_page") and page.has_next_page():
        page = page.next_page()
        resources.extend(page)
    return resources
# Pull every customer, flatten the fields of interest into rows, write the CSV.
all_customers = get_all_resources(shopify.Customer)
data = [
    [
        customer.id,
        customer.first_name,
        customer.last_name,
        customer.addresses,
        customer.phone,
        customer.email,
    ]
    for customer in all_customers
]
df = pd.DataFrame(
    data,
    columns=['CustomerCode','FirstName','LastName','Address','MobileNo','Email'],
)
df.to_csv('CustomerDataFromServer.csv', index=False)
# Drop the API session once the export has been written.
shopify.ShopifyResource.clear_session()
You cannot use page-based pagination anymore.
Use cursor-based pagination instead.

How to retrieve large amounts of data (5000+ videos) from YouTube Data API v3?

My goal is to extract all videos from a playlist which can have many videos, ~3000 and can have more than 5000 videos. With maxResults=50 and after implementing pagination with nextPageToken, I'm only able to call the API 20 times, after which nextPageToken isn't sent with the response
I'm calling the API from a python application. I have a while loop running till nextPageToken isn't sent, ideally this should happen AFTER all the videos are extracted, but it prematurely exits after calling the API 19-20 times
def main():
    """Page through the liked-videos listing until no next-page token is returned.

    Prints a running counter with each token received, then prints the last
    response (with its "items" removed) so the stopping point can be inspected.
    """
    youtube = get_authorised_youtube()  # returns YouTube resource authorized with OAuth.
    # make_single_request() takes in the youtube resource and nextPageToken, if any.
    response = make_single_request(youtube, None)
    # .get() instead of [...]: a single-page result has no token at all, which
    # previously raised KeyError before the try block was even entered.
    next_page_token = response.get("nextPageToken")

    count = 0
    while next_page_token is not None:
        response = make_single_request(youtube, next_page_token)
        next_page_token = response.get("nextPageToken")
        if next_page_token is not None:
            count += 1
            print(count, end=" ")
            print(next_page_token)

    # The final page may legitimately have no "items" key.
    response.pop("items", None)
    print(response)  # prints the last response for analysis


if __name__ == '__main__':
    main()
snippet of make_single_request():
def make_single_request(youtube, nextPageToken):
    """Request one page (up to 50 ids) of the authorised user's liked videos.

    Args:
        youtube: Authorised YouTube API resource.
        nextPageToken: Token of the page to fetch, or ``None`` for the first page.

    Returns:
        The executed response of ``videos().list``.
    """
    params = {"part": "id", "myRating": "like", "maxResults": 50}
    # Only send pageToken when there is one; the first request omits it.
    if nextPageToken is not None:
        params["pageToken"] = nextPageToken
    return youtube.videos().list(**params).execute()
Expected the code to make upwards of 50 API calls but is observed to only make around 20 calls, consistently.
Note: The following code was executed with an unpaid GCP account. The calls made has part="id" which has a quota cost of 0. The calls limit according to GCP is: 10,000. According to the quota on the console, I make only 20.
Output:
1 CGQQAA
2 CJYBEAA
3 CMgBEAA
4 CPoBEAA
5 CKwCEAA
6 CN4CEAA
7 CJADEAA
8 CMIDEAA
9 CPQDEAA
10 CKYEEAA
11 CNgEEAA
12 CIoFEAA
13 CLwFEAA
14 CO4FEAA
15 CKAGEAA
16 CNIGEAA
17 CIQHEAA
18 CLYHEAA
19 {'kind': 'youtube#videoListResponse', 'etag': '"ETAG"', 'prevPageToken': 'CLYHEAE', 'pageInfo': {'totalResults': TOTAL_RESULTS(>4000), 'resultsPerPage': 50}}
EDIT: After changing maxResults=20, It is observed that the code makes around 50 API calls, therefore the total number of videos that can be extracted is a constant at 1000.
For obtaining the entire list of liked videos of a given channel without any omissions, I suggest you to use PlaylistItems endpoint instead, queried for the given channel's liked-videos playlist by passing a proper value to the endpoint's playlistId parameter.
A given channel's liked-videos playlist ID is obtained upon querying the channel's own endpoint. The needed ID is to be found at .items.contentDetails.relatedPlaylists.likes.
If the goal is to retrieve the FULL list of liked videos in a tedious but working way, you can check out this question.
You basically scrape the data of a deep-link page...
What is not mentioned in that post is that, once you have retrieved the video IDs, you may want more data; for that you can use the videos endpoint with a comma-separated list of video IDs to get more information.
If you need inspiration for the script, this is an adjusted version of the API scripts provided by YouTube.
Just adjust the credentials file path and the input path of the file that was retrieved by the web scrape.
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import json
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
def do_request(youtube, video_ids):
    """Fetch details for one batch of video ids.

    Args:
        youtube: Authorised YouTube API resource.
        video_ids: List of video id strings; they are sent comma-joined.

    Returns:
        The "items" list of the response, or an empty list when no ids were
        given (no request is made in that case).
    """
    #https://developers.google.com/youtube/v3/docs/videos/list
    if not video_ids:
        # An empty id filter is not a meaningful query; skip the API call.
        return []
    request = youtube.videos().list(
        part='contentDetails,id,snippet,statistics',
        id=','.join(video_ids),
        maxResults=50
    )
    return request.execute()["items"]
def main(video_ids):
    """Download details for the given videos in batches of 50 and save them as JSON.

    Args:
        video_ids: Iterable of video id strings.

    Side effects:
        Runs the OAuth flow, prints progress, and writes ``./data.json``.
    """
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "INPUTAPICREDFILEHERE./creds.json"

    # Get credentials and create an API client
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    # NOTE(review): run_console() is reportedly gone from recent
    # google-auth-oauthlib releases -- confirm, and consider run_local_server().
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    video_ids = list(video_ids)
    batch_size = 50
    data = {'items': []}
    # Slice-based batching: no request is sent for empty input, and the loop
    # variable no longer shadows the builtin id().
    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]
        print(f"Fetching.. current batch {len(data['items'])} of {len(video_ids)}")
        data['items'].extend(do_request(youtube, batch))

    with open('./data.json', 'w') as outfile:
        outfile.write(json.dumps(data, indent=4))
if __name__ == "__main__":
    # Load the scraped liked-videos mapping; its keys are passed on as the
    # video ids. The with-block closes the file, which was previously leaked.
    with open('PATHTOLIKEDVIDEOS/liked_videos.json', encoding="utf8") as f:
        liked_vids = json.load(f)
    main(list(liked_vids.keys()))
Try waiting a little between requests, like this:
import time
time.sleep(1) # time here in seconds

Facebook Marketing API - Python to get Insights - User Request Limit Reached

So I am trying my best to navigate my way through the Facebook API. I need to create a script that will download my business' campaign information daily as a csv file so I can use another script to upload the information to our database easily.
I finally have code that works to print the information to the log, but I am reaching the user request limit because I have to call get_insights() for every single campaign individually. I am wondering if anyone knows how to help me make it so I don't have to call the facebook API as often.
What I would like to do is find a field where I can get the daily spend, so I don't have to call the API in every iteration of my for-campaign loop, but I cannot for the life of me find a way to do so.
#Import all the facebook mumbo jumbo
from facebookads.api import FacebookAdsApi
from facebookads.adobjects.adset import AdSet
from facebookads.adobjects.campaign import Campaign
from facebookads.adobjects.adsinsights import AdsInsights
from facebookads.adobjects.adreportrun import AdReportRun
from facebookads.adobjects.adaccount import AdAccount
from facebookads.adobjects.business import Business
import time
#Set the login info
my_app_id = '****'
my_app_secret = '****'
my_access_token = '****'
#Start the connection to the facebook API
FacebookAdsApi.init(my_app_id, my_app_secret, my_access_token)
business = Business('****')
#Get all ad accounts on the business account
accounts = business.get_owned_ad_accounts(fields=[AdAccount.Field.id])
#iterate through all accounts in the business account
for account in accounts:
    tempaccount = AdAccount(account[AdAccount.Field.id])
    #get all campaigns in the adaccount
    # Only real field names belong in `fields`; the stray `Campaign.Field`
    # entry (the class itself) has been dropped.
    campaigns = tempaccount.get_campaigns(fields=[Campaign.Field.name])
    #iterate through all the campaigns in the adaccount
    for campaign in campaigns:
        print(campaign[Campaign.Field.name])
        #get the insight info (spend) from each campaign
        # One API call per campaign: this is what runs into the request limit.
        campaignsights = campaign.get_insights(params={'date_preset':'yesterday'},fields=[AdsInsights.Field.spend])
        print (campaignsights)
It took a while of digging through the API and guessing but I got it! Here is my final script:
# This program downloads all relevent Facebook traffic info as a csv file
# This program requires info from the Facebook Ads API: https://github.com/facebook/facebook-python-ads-sdk
# Import all the facebook mumbo jumbo
from facebookads.api import FacebookAdsApi
from facebookads.adobjects.adsinsights import AdsInsights
from facebookads.adobjects.adaccount import AdAccount
from facebookads.adobjects.business import Business
# Import th csv writer and the date/time function
import datetime
import csv
# Set the info to get connected to the API. Do NOT share this info
my_app_id = '****'
my_app_secret = '****'
my_access_token = '****'
# Start the connection to the facebook API
FacebookAdsApi.init(my_app_id, my_app_secret, my_access_token)
# Create a business object for the business account
business = Business('****')
# Get yesterday's date for the filename, and the csv data
yesterdaybad = datetime.datetime.now() - datetime.timedelta(days=1)
yesterdayslash = yesterdaybad.strftime('%m/%d/%Y')
yesterdayhyphen = yesterdaybad.strftime('%m-%d-%Y')
# Define the destination filename
filename = yesterdayhyphen + '_fb.csv'
filelocation = "/cron/downloads/"+ filename
# Get all ad accounts on the business account
accounts = business.get_owned_ad_accounts(fields=[AdAccount.Field.id])

# Insight fields whose value is written to the CSV as-is, in column order.
plain_fields = [
    AdsInsights.Field.account_id,
    AdsInsights.Field.account_name,
    AdsInsights.Field.ad_id,
    AdsInsights.Field.ad_name,
    AdsInsights.Field.adset_id,
    AdsInsights.Field.adset_name,
    AdsInsights.Field.campaign_id,
    AdsInsights.Field.campaign_name,
]
# Insight fields that arrive as a list of dicts; only the first 'value' is kept.
nested_fields = [
    AdsInsights.Field.cost_per_outbound_click,
    AdsInsights.Field.outbound_clicks,
]


def _plain_value(ad, field):
    """Return ad[field], or "" when the insight row does not carry that field."""
    return ad[field] if field in ad else ""


def _first_nested_value(ad, field):
    """Return the 'value' of the first entry of a list-valued field, or ""."""
    if field not in ad:
        return ""
    entries = ad[field]
    if not entries:
        return ""
    return entries[0].get('value')


# Open or create new file. The old third argument (0777) was an invalid
# Python 3 literal and was being passed as the buffering size, not as a mode.
try:
    csvfile = open(filelocation, 'w', newline='')
except OSError:
    print ("Cannot open file.")
    # Nothing below can work without the file, so stop here.
    raise

# To keep track of rows added to file
rows = 0
with csvfile:
    # Create file writer
    filewriter = csv.writer(csvfile, delimiter=',')
    # Iterate through the adaccounts
    for account in accounts:
        # Create an adaccount object from the adaccount id to make it possible to get insights
        tempaccount = AdAccount(account[AdAccount.Field.id])
        # Grab insight info for all ads in the adaccount in a single request
        ads = tempaccount.get_insights(
            params={'date_preset':'yesterday', 'level':'ad'},
            fields=plain_fields + nested_fields + [AdsInsights.Field.spend],
        )
        # Iterate through all ads returned for this account
        for ad in ads:
            row = [yesterdayslash]
            row.extend(_plain_value(ad, field) for field in plain_fields)
            row.extend(_first_nested_value(ad, field) for field in nested_fields)
            row.append(_plain_value(ad, AdsInsights.Field.spend))
            # Write all ad info to the file, and count the row
            filewriter.writerow(row)
            rows += 1

# Print report
print (str(rows) + " rows added to the file " + filename)
I then have a php script that takes the csv file and uploads it to my database. The key is pulling all the insight data in one big yank. You can then break it up however you want because each ad has information about its adset, adaccount, and campaign.
Adding a couple of small functions to improve on LucyTurtle's answer as it is still susceptible to Facebook's Rate Limiting
import logging
import requests as rq
#Function to find the string between two strings or characters
def find_between( s, first, last ):
    """Return the text of ``s`` lying between ``first`` and the next ``last``.

    The search for ``last`` starts right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    opening = s.find( first )
    if opening == -1:
        return ""
    begin = opening + len( first )
    finish = s.find( last, begin )
    if finish == -1:
        return ""
    return s[begin:finish]
#Function to check how close you are to the FB Rate Limit
def check_limit():
    """Return how close the ad account is to Facebook's rate limit, as a percentage.

    Makes one insights request for ``account_number`` (module-level, like
    ``my_access_token``) and reads the ``x-business-use-case-usage`` response
    header.

    Returns:
        The largest of ``call_count``, ``total_cputime`` and ``total_time``
        across all entries in the header, as a float; 0.0 when the response
        carries no usage header.
    """
    import json  # local import: only this function needs it

    check=rq.get('https://graph.facebook.com/v3.3/act_'+account_number+'/insights?access_token='+my_access_token)
    header = check.headers.get('x-business-use-case-usage')
    if not header:
        return 0.0

    # The header is JSON ({id: [{...usage counters...}]}). Parsing it properly
    # avoids depending on key order, which the old substring search did.
    usage = 0.0
    for entries in json.loads(header).values():
        for entry in entries:
            for counter in ('call_count', 'total_cputime', 'total_time'):
                usage = max(usage, float(entry.get(counter, 0)))
    return usage
#Check if you reached 75% of the limit, if yes then back-off for 5 minutes (put this chunk in your 'for ad in ads' loop, every 100-200 iterations)
import time  # needed for the sleep below; not imported with the helpers above
if (check_limit()>75):
    print('75% Rate Limit Reached. Cooling Time 5 Minutes.')
    logging.debug('75% Rate Limit Reached. Cooling Time 5 Minutes.')
    time.sleep(300)
I'd just like to say
Thank you.
As Marks Andre said - you made my day!
The FB SDK documentation is exhaustive, but it completely lacks the practical implementation examples for day-to-day-tasks like this one. Bookmark is set - page will be revisited soon.
So the only thing I can actually contribute for fellow sufferers: it seems that with the newer facebook_business SDK you can simply completely replace "facebookads" in the import statements with "facebook_business".

How to parse a single-column text file into a table using python?

I'm new here to StackOverflow, but I have found a LOT of answers on this site. I'm also a programming newbie, so i figured i'd join and finally become part of this community - starting with a question about a problem that's been plaguing me for hours.
I login to a website and scrape a big body of text within the b tag to be converted into a proper table. The layout of the resulting Output.txt looks like this:
BIN STATUS
8FHA9D8H 82HG9F RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN STATUS
HA8DHW2H HD0138 RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123 #2956- INVALID STOCK COUPON CODE (MISSING).
93827548 096DBR RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
There are a bunch of pages with the exact same blocks, but i need them to be combined into an ACTUAL table that looks like this:
BIN INV CODE STATUS
HA8DHW2HHD0138 FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU00A123 FPBC-*SOUP CANS LENTILS #2956- INVALID STOCK COUPON CODE (MISSING).
93827548096DBR FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8FHA9D8H82HG9F SSXR-98-20LM NM CORN CREAM RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
Essentially, all separate text blocks in this example would become part of this table, with the inv code repeating with its Bin values. I would post my attempts at parsing this data(have tried Pandas/bs/openpyxl/csv writer), but ill admit they are a little embarrassing, as i cannot find any information on this specific problem. Is there any benevolent soul out there that can help me out? :)
(Also, i am using Python 2.7)
A simple custom parser like the following should do the trick.
from __future__ import print_function
def parse_body(s):
    """Print one tab-separated table row per bin line found in the scraped text.

    Expected layout, repeated per inventory code: an ``INVENTORY CODE:`` line,
    a ``BIN ...`` header line ending in ``MESSAGE`` or ``STATUS``, one line
    per bin, then a blank line.

    Args:
        s: The whole scraped text, lines separated by "\n".

    Output:
        For every bin line, prints bin id, inventory code and message. Lines
        that cannot form a complete row are reported and skipped.
    """
    line_sep = '\n'
    # The sample data uses a STATUS column; MESSAGE is kept for older input.
    header_endings = ('MESSAGE', 'STATUS')
    getting_bins = False
    inv_code = ''
    for line in s.split(line_sep):
        if line.startswith('INVENTORY CODE:'):
            if getting_bins:
                print("unexpected inventory code while reading bins:", line)
                continue
            # Tokens after "INVENTORY" and "CODE:" -> "<code>-<description>".
            inv_data = line.split()[2:]
            if inv_data:
                inv_code = inv_data[0] + '-' + ' '.join(inv_data[1:])
            else:
                inv_code = ''
        elif line.startswith('BIN') and line.endswith(header_endings):
            getting_bins = True
        elif getting_bins and line:
            bin_data = line.split()
            # A row needs an inventory code plus two bin-id tokens and at
            # least one message token.
            if not inv_code or len(bin_data) < 3:
                print("skipping malformed bin line:", line)
                continue
            bin_id = ''.join(bin_data[0:2])
            message = ' '.join(bin_data[2:])
            # we now have a bin, an inv_code, and a message to add to our table
            print(bin_id.ljust(20), inv_code.ljust(30), message, sep='\t')
        elif getting_bins:
            # blank line: done getting bins for current inventory code
            getting_bins = False
            inv_code = ''
A rather complex one, but this might get you started:
import re, pandas as pd
from pandas import DataFrame
# Matches one inventory block: the code line, then the BIN header, then the
# run of bin/message lines that follows it.
rx = re.compile(r'''
(?:INVENTORY\ CODE:)\s*
(?P<inv>.+\S)
[\s\S]+?
^BIN.+[\n\r]
(?P<bin_msg>(?:(?!^\ ).+[\n\r])+)
''', re.MULTILINE | re.VERBOSE)
# Splits a single bin line into its bin part and its message part.
# Compiled once here instead of once per inventory block.
rxbinmsg = re.compile(r'^(?P<bin>(?:(?!\ {2}).)+)\s+(?P<message>.+\S)\s*$', re.MULTILINE)
string = your_string_here
# Collect rows first and build the dataframe once, rather than growing it
# row by row.
rows = []
for match in rx.finditer(string):
    inv = match.group('inv')
    for item in match.group('bin_msg').split("\n"):
        for m in rxbinmsg.finditer(item):
            rows.append([m.group('bin'), inv, m.group('message')])
df = DataFrame(rows, columns = ['BIN', 'INV', 'MESSAGE'])
print(df)
Explanation
It looks for INVENTORY CODE and sets up the groups (inv and bin_msg) for further processing in afterwork() (note: it would be easier if you had only one line of bin/msg as you need to split the group here afterwards).
Afterwards, it splits the bin and msg part and appends all to the df object.
I had some code written for website scraping which may help you.
Basically, what you need to do is right-click on the web page, view the HTML, find the tag of the table you are looking for, and extract the information with a parsing module (I am using Beautiful Soup). I am creating JSON because I need to store it in MongoDB; you can create a table instead.
#! /usr/bin/python
import sys
import requests
import re
from BeautifulSoup import BeautifulSoup
import pymongo
def req_and_parsing():
    """Download the ETA page of each route, parse it, and store the results in MongoDB."""
    url2 = 'http://businfo.dimts.in/businfo/Bus_info/EtaByRoute.aspx?ID='
    list1 = ['534UP','534DOWN']
    # One request + parse per route. The earlier loop that only built an
    # unused URL has been removed.
    outdict = [parsing_file( requests.get(url2+Route).text,Route) for Route in list1 ]
    print(outdict)
    conn = f_connection()
    for stop_dict in outdict:
        insert_records(conn,stop_dict)
def parsing_file(txt,Route):
    """Parse one route page into a {first-cell text: second-cell text} dict.

    Args:
        txt: HTML of the route page.
        Route: Route id, used only in the error message.

    Returns:
        A dict built from every two-cell row of the results table, with ';'
        in the value replaced by ','.

    Exits the program when the page's error label carries one of the two
    known "nothing to show" messages.
    """
    soup = BeautifulSoup(txt)
    table = soup.findAll("table",{"id" : "ctl00_ContentPlaceHolder1_GridView2"})
    # Messages that mean the page has no usable table.
    fatal_messages = (
        "Currently no bus is running on this route",
        "This is not a cluster (orange bus) route",
    )
    divtags = soup.findAll("span",{"id":"ctl00_ContentPlaceHolder1_ErrorLabel"})
    for divtag in divtags:
        print("div tag -  %s" % divtag.text)
        # Membership test: the previous `== a or "b"` form was always true,
        # so any error label at all terminated the program.
        if divtag.text in fatal_messages:
            print("Page not displayed Errored with below meeeage for Route- %s  ,  %s" % (Route, divtag.text))
            sys.exit()
    trtddict = {}
    for trtag in table[0].findAll('tr'):
        tdtags = trtag.findAll('td')
        # Only two-cell rows carry a key/value pair.
        if len(tdtags) == 2:
            trtddict[tdtags[0].text] = sub_colon(tdtags[1].text)
    return trtddict
def sub_colon(tag_str):
    """Return ``tag_str`` with every ';' replaced by ','."""
    return tag_str.replace(';', ',')
def f_connection():
    """Open a MongoDB client with default settings and return it.

    Raises:
        pymongo.errors.ConnectionFailure: re-raised after printing the
            message. Previously the function fell through to ``return conn``
            with ``conn`` unbound.
    """
    try:
        conn = pymongo.MongoClient()
    except pymongo.errors.ConnectionFailure as e:
        print("Could not connect to MongoDB: %s" % e)
        raise
    print("Connected successfully!!!")
    return conn
def insert_records(conn,stop_dict):
    """Print the collection names of the ``test`` database, then insert ``stop_dict`` into ``stopsETA``."""
    database = conn.test
    print(database.collection_names())
    database.stopsETA.insert(stop_dict)
if __name__ == "__main__":
req_and_parsing()

Categories

Resources