Extracting JavaScript Variables into Python Dictionaries

My understanding is that I have to use PyQt5 to run the JavaScript on the client side and then extract the resulting HTML with BeautifulSoup. I am trying to convert the JavaScript variable _Flourish_data into a Python dictionary.
Is there an easy way to extract the JavaScript variable _Flourish_data into a Python dictionary? Here is my current Python code to extract the JavaScript using PyQt5 and BeautifulSoup:
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        # toHtml is asynchronous; it delivers the HTML to the callback
        self.toHtml(self.Callable)

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()

page = Page('https://flo.uri.sh/visualisation/2451841/embed?auto=1')
soup = bs.BeautifulSoup(page.html, 'html.parser')
js_test = soup.find_all('script')
js_test[5]
The output of the existing code is
<script>
function _Flourish_unflattenInto(dest, src) {
    dest = dest || {};
    for (var k in src) {
        var t = dest;
        for (var i = k.indexOf("."), p = 0; i >= 0; i = k.indexOf(".", p = i+1)) {
            var s = k.substring(p, i);
            if (!(s in t)) t[s] = {};
            t = t[s];
        }
        t[k.substring(p)] = src[k];
    }
    return dest;
}
var _Flourish_settings = {"cell_fill_1":"#ffffff","cell_fill_2":"#ebebeb","cell_fill_direction":"horizontal","cell_font_size":"1","cell_height":20,"cell_horizontal_alignment":"center","cell_link_color":"#2886b2","cell_padding_horizontal":16,"cell_padding_vertical":11,"column_width_mode":"auto","column_widths":"10%, 10%, 10%, 10%, 50%, 10%","header_fill":"#181f6c","header_font_color":"#ffffff","header_font_default":false,"header_font_size":1.1,"header_horizontal_alignment":"center","header_style_default":true,"layout.body_font":{"name":"Source Sans Pro","url":"https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,700"},"layout.layout_order":"stack-default","layout.space_between_sections":"0.5","mobile.view":true,"no_results_text":"Use the search bar to find your state","pagination_amount":41,"pagination_amount_search":"5","search_enabled":false,"search_hide_table":false,"search_placeholder":"Search to find your state","search_resize":true,"search_width":15};
_Flourish_unflattenInto(window.template.state, _Flourish_settings);
var _Flourish_data_column_names = {"rows":{"columns":["State ","Earliest/Planned Start Date for 20/21 Academic Year ","","","",""]}},
_Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};
for (var _Flourish_dataset in _Flourish_data) {
    window.template.data[_Flourish_dataset] = _Flourish_data[_Flourish_dataset];
    window.template.data[_Flourish_dataset].column_names = _Flourish_data_column_names[_Flourish_dataset];
}
window.template.draw();
window.template.draw();
</script>
I just want the variable _Flourish_data from the script tag, as shown below:
_Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};
Any help would be greatly appreciated!

You don't need to execute the JavaScript. It can be done with the json and re modules.
For example:
import re
import json
import requests

url = 'https://flo.uri.sh/visualisation/2451841/embed?auto=1'
html_data = requests.get(url).text

# Pull the JSON object assigned to _Flourish_data out of the raw HTML
data = re.search(r'_Flourish_data = (\{.*?\});', html_data).group(1)
data = json.loads(data)

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

for row in data['rows']:
    print('{:<55}{}'.format(*map(str.strip, row['columns'][:2])))
Prints:
Alabama Varies by district
Alaska Varies by district
American Samoa Unknown
Arizona Varies by district
Arkansas Varies by district
Bureau of Indian Education Varies by district
California Varies by district
Colorado Varies by district
Connecticut Not yet determined
Delaware Varies by district
Department of Defense Education Activity Varies by district
District of Columbia 8/31/2020
Florida Unknown
Georgia Unknown
Guam Unknown
Hawaii Not yet determined
Idaho Varies by District
Illinois Varies by district
Indiana Not yet determined
Iowa Varies by district
Kansas Not yet determined
Kentucky Unknown
Louisiana Varies by district
Maine Varies by district
Maryland Not yet determined
Massachusetts Not yet determined
Michigan Not yet determined
Minnesota Not yet determined
Mississippi Varies by district
Missouri Varies by district
Montana Varies by district
Nebraska Varies by district
Nevada Varies by district
New Hampshire Not yet determined
New Jersey Varies by district
New Mexico Unknown
New York Not yet determined
North Carolina 8/17/2020
North Dakota Varies by district
Northern Marianas Unknown
Ohio Not yet determined
Oklahoma Varies by district
Oregon Not yet determined
Pennsylvania Varies by district
Puerto Rico Unknown
Rhode Island Not yet determined
South Carolina Not yet determined
South Dakota Varies by district
Tennessee Varies by district
Texas Varies by district
U.S. Virgin Islands Not yet determined
Utah Varies by district
Vermont Not yet determined
Virginia Not yet determined
Washington Varies by District
West Virginia Not yet determined
Wisconsin Varies by district
Wyoming Not yet determined
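
If you want the result keyed by state rather than printed, a small follow-up sketch (assuming data is the dictionary parsed above):

# Build a {state: start_date} dict from the parsed rows
start_dates = {row['columns'][0].strip(): row['columns'][1].strip()
               for row in data['rows']}
print(start_dates['North Carolina'])  # prints: 8/17/2020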

import requests
import re
import json

def main(url):
    r = requests.get(url)
    match = json.loads(re.search(r'_Flourish_data = ({.*})', r.text).group(1))
    print(match.keys())

main("https://flo.uri.sh/visualisation/2451841/embed?auto=1")

Related

Scraping an HTML site using BeautifulSoup and finding the value of "total_pages" in it

I'm writing Python code that scrapes the following website and looks for the value of "total_pages" in it.
The website is https://www.usnews.com/best-colleges/fl
When I open the website in a browser and inspect the source code, the value of "total_pages" is 8. I want my Python code to be able to get the same value.
I have written the following code:
import requests
from bs4 import BeautifulSoup
headers ={'User-Agent': 'Mozilla/5.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
main_site=requests.get("https://www.usnews.com/best-colleges/fl",headers=headers)
main_site_content=main_site.content
main_site_content_soup=BeautifulSoup(main_site_content,"html.parser")
But then I get stuck on how to look for "total_pages" in the parsed data. I have tried the find_all() method, but no luck; I think I'm not using it correctly.
One note: the solution does not have to use BeautifulSoup. I just used BeautifulSoup since I was a bit familiar with it.
No need for BeautifulSoup. Here I make a request to their API to get the list of universities.
from rich import print is used to pretty-print the output; it should make it easier to read.
If you need more help or advice, leave a comment below.
import requests
import pandas as pd  # needed for the DataFrame below
from rich import print

LINK = "https://www.usnews.com/best-colleges/api/search?format=json&location=Florida&_sort=rank&_sortDirection=asc&_page=1"

def get_data(url):
    print("Making request to:", url)
    response = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/5.0"})
    if response.status_code == 200:
        print("Request Successful!")
        data = response.json()["data"]
        return data["items"], data["next_link"]
    print("Request failed!")
    return None, None

def main():
    print("Starting Scraping...")
    items, next_link = get_data(LINK)
    # if there's a `next_link`, scrape it.
    while next_link is not None:
        print("Getting data from:", next_link)
        new_items, next_link = get_data(next_link)
        items += new_items
    # cleaning the data, for the pandas dataframe.
    items = [
        {
            "name": item["institution"]["displayName"],
            "state": item["institution"]["state"],
            "city": item["institution"]["city"],
        }
        for item in items
    ]
    df = pd.DataFrame(items)
    print(df.to_markdown())

if __name__ == "__main__":
    main()
The output looks like this:
|    | name                              | state   | city           |
|---:|:----------------------------------|:--------|:---------------|
|  0 | University of Florida             | FL      | Gainesville    |
|  1 | Florida State University          | FL      | Tallahassee    |
|  2 | University of Miami               | FL      | Coral Gables   |
|  3 | University of South Florida       | FL      | Tampa          |
|  4 | University of Central Florida     | FL      | Orlando        |
|  5 | Florida International University  | FL      | Miami          |
|  6 | Florida A&M University            | FL      | Tallahassee    |
|  7 | Florida Institute of Technology   | FL      | Melbourne      |
|  8 | Nova Southeastern University      | FL      | Ft. Lauderdale |
| ...| ...                               | ...     | ...            |
| 74 | St. John Vianney College Seminary | FL      | Miami          |
| 75 | St. Petersburg College            | FL      | St. Petersburg |
| 76 | Tallahassee Community College     | FL      | Tallahassee    |
| 77 | Valencia College                  | FL      | Orlando        |
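
Note the API loop above sidesteps "total_pages" entirely. If you only need that value, a possible sketch pulls it from the page source with a regex, assuming the key appears as "total_pages": followed by a number in an embedded JSON blob (which is what the browser inspection in the question suggests):

import re
import requests

headers = {"User-Agent": "Mozilla/5.0"}
html_data = requests.get("https://www.usnews.com/best-colleges/fl", headers=headers).text

# assumption: the page embeds JSON containing "total_pages":<number>
match = re.search(r'"total_pages"\s*:\s*(\d+)', html_data)
if match:
    print(int(match.group(1)))  # the question reports this should be 8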

How to scrape information from tables selecting each of the Dropdown options using Selenium and Python?

Trying to help someone who works for a nonprofit. Currently trying to pull info from the STL County Boards/Commissions website(https://boards.stlouisco.com/).
Having trouble for a few reasons:
I was going to attempt to use BeautifulSoup, but the actual data isn't even shown until you choose a Board/Commission from a dropdown menu above, so I have switched to Selenium, which I am new to.
Is this task possible? When I look at the html code for the site, I see that the info isn't stored in the page, but pulled from another location and just displayed on the site based on the option chosen from the dropdown menu.
function ShowMemberList(selectedBoard) {
    ClearMeetingsAndMembers();
    var htmlString = "";
    var boardsList = [{"id":407,"name":"Aging Ahead","isActive":true,"description":"... ...1.","totalSeats":14}];
    var totalMembers = boardsList[$("select[name='BoardsList'] option:selected").index() - 1].totalSeats;
    $.get("/api/boards/" + selectedBoard + "/members", function (data) {
        if (data.length > 0) {
            htmlString += "<table id=\"MemberTable\" class=\"table table-hover\">";
            htmlString += "<thead><th>Member Name</th><th>Title</th><th>Position</th><th>Expiration Date</th></thead><tbody>";
            for (var i = 0; i < totalMembers; i++) {
                if (i < data.length) {
                    htmlString += "<tr><td>" + FormatString(data[i].firstName) + " " + FormatString(data[i].lastName) + "</td><td>" + FormatString(data[i].title) + "</td><td>" + FormatString(data[i].position) + "</td><td>" + FormatString(data[i].expirationDate) + "</td></tr>";
                } else {
                    htmlString += "<tr><td colspan=\"4\">---Vacant Seat---</td></tr>"
                }
            }
            htmlString += "</tbody></table>";
        } else {
            htmlString = "<span id=\"MemberTable\">There was no data found for this board.</span>";
        }
        $("#Results").append(htmlString);
    });
}
So far, I have this (not a lot), which goes to the page and selects every board from the list:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait as wait

driver = webdriver.Chrome()
driver.get("https://boards.stlouisco.com/")

select = Select(wait(driver, 10).until(EC.presence_of_element_located((By.ID, 'BoardsList'))))
options = select.options
for board in options:
    select.select_by_visible_text(board.text)
From here I would like to be able to scrape the info from the MemberTable, but I don't know how to move forward, whether it is within the scope of my abilities, or even whether it is possible with Selenium.
I've tried using find_element with a few different locators to click on the member table but am met with errors. I have also tried looking for the member table right after my select, but it is not able to find that element. Any tips/pointers/advice is appreciated!
You can use this script to save all members from all boards to csv:
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://boards.stlouisco.com/'
members_url = 'https://boards.stlouisco.com/api/boards/{}/members'

soup = BeautifulSoup(requests.get(url).content, 'html.parser')

all_data = []
for o in soup.select('#BoardsList option[value]'):
    print(o['value'], o.text)
    data = requests.get(members_url.format(o['value'])).json()
    for d in data:
        all_data.append(dict(board=o.text, **d))

df = pd.DataFrame(all_data)
print(df)
df.to_csv('data.csv')
Prints:
board boardMemberId memberId boardName ... lastName title position expirationDate
0 Aging Ahead 39003 27007 None ... Anderson None ST. LOUIS COUNTY EXECUTIVE APPOINTEE 10/1/2020
1 Aging Ahead 38963 27797 None ... Bauers None St. Charles County Community Action Agency App... None
2 Aging Ahead 39004 27815 None ... Berkowitz None ST. LOUIS COUNTY EXECUTIVE APPOINTEE 10/1/2020
3 Aging Ahead 38964 27798 None ... Biehle None Jefferson County Community Action Corp. Appointee None
4 Aging Ahead 38581 27597 None ... Bowers None Franklin County Commission Appointee None
.. ... ... ... ... ... ... ... ... ...
725 Zoo-Museum District - Zoological Park Subdistr... 38863 26745 None ... Seat (Robert R. Hermann, Jr.) St. Louis County 12/31/2019
726 Zoo-Museum District - Zoological Park Subdistr... 38864 26745 None ... Seat (Winthrop Reed) St. Louis County 12/31/2016
727 Zoo-Museum District - Zoological Park Subdistr... 38669 26745 None ... Seat (Lawrence Thomas) St. Louis County 12/31/2018
728 Zoo-Museum District - Zoological Park Subdistr... 38670 26745 None ... Seat (Peggy Ritter ) Advisory Commissioner Non-Voting St. Louis County 12/31/2019
729 Zoo-Museum District - Zoological Park Subdistr... 38394 27512 None ... Wilson Advisory Commissioner Non-Voting City of St. Louis None
[730 rows x 9 columns]
And saves data.csv with all boards/members.
To choose each Board/Commission from the html-select dropdown and scrape the page, you have to induce WebDriverWait for element_to_be_clickable(), and you can use the following locator strategy:
Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://boards.stlouisco.com/")

select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'BoardsList'))))
for option in select.options:
    option.click()
    print("Scrapping :" + option.text)
Console Output:
Scrapping :---Choose a Board---
Scrapping :Aging Ahead
Scrapping :Aging Ahead Advisory Council
Scrapping :Air Pollution & Noise Control Appeal Board
Scrapping :Animal Care & Control Advisory Board
Scrapping :Bi-State Development Agency (Metro)
Scrapping :Board Of Examiners For Mechanical Licensing
Scrapping :Board of Freeholders
Scrapping :Boundary Commission
Scrapping :Building Code Review Committee
Scrapping :Building Commission & Board Of Building Appeals
Scrapping :Business Advisory Council
Scrapping :Center for Educational Media
Scrapping :Civil Service Commission
Scrapping :Commission On Disabilities
Scrapping :County Health Advisory Board
Scrapping :Domestic And Family Violence Council
Scrapping :East-West Gateway Council of Governments Board of Directors
Scrapping :Economic Development Collaborative Advisory Board
Scrapping :Economic Rescue Team
Scrapping :Electrical Code Review Committee
Scrapping :Electrical Examiners, Board Of
Scrapping :Emergency Communications System Commission
Scrapping :Equalization, Board Of
Scrapping :Fire Standards Commission
Scrapping :Friends of the Kathy J. Weinman Shelter for Battered Women, Inc.
Scrapping :Fund Investment Advisory Committee
Scrapping :Historic Building Commission
Scrapping :Housing Authority
Scrapping :Housing Resources Commission
Scrapping :Human Relations Commission
Scrapping :Industrial Development Authority Board
Scrapping :Justice Services Advisory Board
Scrapping :Lambert Airport Eastern Perimeter Joint Development Commission
Scrapping :Land Clearance For Redevelopment Authority
Scrapping :Lemay Community Improvement District
Scrapping :Library Board
Scrapping :Local Emergency Planning Committee
Scrapping :Mechanical Code Review Committee
Scrapping :Metropolitan Park And Recreation District Board Of Directors (Great Rivers Greenway)
Scrapping :Metropolitan St. Louis Sewer District
Scrapping :Metropolitan Taxicab Commission
Scrapping :Metropolitan Zoological Park and Museum District Board
Scrapping :Municipal Court Judges
Scrapping :Older Adult Commission
Scrapping :Parks And Recreation Advisory Board
Scrapping :Planning Commission
Scrapping :Plumbing Code Review Committee
Scrapping :Plumbing Examiners, Board Of
Scrapping :Police Commissioners, Board Of
Scrapping :Port Authority Board Of Commissioners
Scrapping :Private Security Advisory Committee
Scrapping :Productive Living Board
Scrapping :Public Transportation Commission of St. Louis County
Scrapping :Regional Arts Commission
Scrapping :Regional Convention & Sports Complex Authority
Scrapping :Regional Convention & Visitors Commission
Scrapping :REJIS Commission
Scrapping :Restaurant Commission
Scrapping :Retirement Board Of Trustees
Scrapping :St. Louis Airport Commission
Scrapping :St. Louis County Children's Service Fund Board
Scrapping :St. Louis County Clean Energy Development Board (PACE)
Scrapping :St. Louis County Workforce Development Board
Scrapping :St. Louis Economic Development Partnership
Scrapping :St. Louis Regional Health Commission
Scrapping :St. Louis-Jefferson Solid Waste Management District
Scrapping :Tax Increment Financing Commission of St. Louis County
Scrapping :Transportation Board
Scrapping :Waste Management Commission
Scrapping :World Trade Center - St. Louis
Scrapping :Zoning Adjustment, Board of
Scrapping :Zoo-Museum District - Art Museum Subdistrict Board of Commissioners
Scrapping :Zoo-Museum District - Botanical Garden Subdistrict Board of Commissioners
Scrapping :Zoo-Museum District - Missouri History Museum Subdistrict Board of Commissioners
Scrapping :Zoo-Museum District - St. Louis Science Center Subdistrict Board of Commissioners
Scrapping :Zoo-Museum District - Zoological Park Subdistrict Board of Commissioners
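To go a step further and actually read the member table after each selection, one possible continuation (a sketch, untested; the MemberTable id comes from the JavaScript shown in the question, and the table is rendered asynchronously after each click):

# Sketch: after clicking an option, wait for the asynchronously
# rendered member table, then read its rows.
for option in select.options[1:]:  # skip the "---Choose a Board---" placeholder
    option.click()
    table = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'MemberTable')))
    for row in table.find_elements(By.CSS_SELECTOR, 'tbody tr'):
        print([cell.text for cell in row.find_elements(By.TAG_NAME, 'td')])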
References
You can find a couple of relevant discussions in:
Message: Element could not be scrolled into view while trying to click on an option within a dropdown menu through Selenium
How to open the option items of a select tag (dropdown) in different tabs/windows?

Want to store variable names in list, not said variable's contents

Sorry if the title is confusing; let me explain.
So, I've written a program that categorizes emails by topic using nltk and tools from sklearn.
Here is that code:
#Extract Emails
tech = extract_message("C:\\Users\\Cody\\Documents\\Emails\\tech.html")
gary = extract_message("C:\\Users\\Cody\\Documents\\Emails\\gary.html")
gary2 = extract_message("C:\\Users\\Cody\\Documents\\Emails\\gary2.html")
jesus = extract_message("C:\\Users\\Cody\\Documents\\Emails\\Jesus.html")
jesus2 = extract_message("C:\\Users\\Cody\\Documents\\Emails\\jesus2.html")
hockey = extract_message("C:\\Users\\Cody\\Documents\\Emails\\hockey.html")
hockey2 = extract_message("C:\\Users\\Cody\\Documents\\Emails\\hockey2.html")
shop = extract_message("C:\\Users\\Cody\\Documents\\Emails\\shop.html")
#Build dictionary of features
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(news.data)
#Downscaling
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
tf_transformer = TfidfTransformer(use_idf=False).fit(x_train_counts)
x_train_tf = tf_transformer.transform(x_train_counts)
#Train classifier
clf = MultinomialNB().fit(x_train_tfidf, news.target)
#List of the extracted emails
docs_new = [gary, gary2, jesus, jesus2, shop, tech, hockey, hockey2]
#Extract feautures from emails
x_new_counts = count_vect.transform(docs_new)
x_new_tfidf = tfidf_transformer.transform(x_new_counts)
#Predict the categories for each email
predicted = clf.predict(x_new_tfidf)
Now I'm looking to store each variable in an appropriate list, based on the predicted label. I figured I could do that like this:
#Store Files in a category
hockey_emails = []
computer_emails = []
politics_emails = []
tech_emails = []
religion_emails = []
forsale_emails = []

#Print out results and store each email in the appropriate category list
for doc, category in zip(docs_new, predicted):
    print('%r ---> %s' % (doc, news.target_names[category]))
    if(news.target_names[category] == 'comp.sys.ibm.pc.hardware'):
        computer_emails.append(doc)
    if(news.target_names[category] == 'rec.sport.hockey'):
        hockey_emails.append(doc)
    if(news.target_names[category] == 'talk.politics.misc'):
        politics_emails.append(doc)
    if(news.target_names[category] == 'soc.religion.christian'):
        religion_emails.append(doc)
    if(news.target_names[category] == 'misc.forsale'):
        forsale_emails.append(doc)
    if(news.target_names[category] == 'comp.sys.ibm.pc.hardware'):
        computer_emails.append(doc)
If I print out one of these lists, say hockey for instance, the output displays the contents stored in the variable rather than the variable's name.
I want this:
print(hockey_emails)
output: ['hockey', 'hockey2']
but instead I'm getting this:
output: ['View View online click here Hi Thanks for signing up as a EA SPORTS NHL insider You ll now receive all of the latest and greatest news and info at this e mail address as you ve requested EA com If you need technical assistance please contact EA Help Privacy Policy Our Certified Online Privacy Policy gives you confidence whenever you play EA games To view our complete Privacy and Cookie Policy go to privacy ea com or write to Privacy Policy Administrator Electronic Arts Inc Redwood Shores Parkway Redwood City CA Electronic Arts Inc All Rights Reserved Privacy Policy User Agreement Legal ActionsMark as UnreadMark as ReadMark as SpamStarClear StarArchive Previous Next ', 'View News From The Hockey Writers The Editor s Choice stories from The Hockey Writers View this email in your browser edition Recap Stars Steamroll Predators By Matt Pryor on Dec am As the old Mary Chapin Carpenter song goes Sometimes you re the windshield Sometimes you re the bug It hasn t happened very often this season but the Dallas Stars had a windshield Continue Reading A Review of Years in Blue and White Damien Cox One on One By Anthony Fusco on Dec pm The Toronto Maple Leafs are one of the most storied and iconic franchises in the entire National Hockey League They have a century of history that spans all the way back to the early s When you have an Continue Reading Bruins Will Not Miss Beleskey By Kyle Benson on Dec am On Monday it was announced that Matt Beleskey will miss the next six weeks due to a knee injury he sustained over the weekend in a game against the Buffalo Sabres Six weeks is a long stint to be without a potential top Continue Reading Recent Articles Galchenyuk Injury Costly for CanadiensFacing Off Picking Team Canada for World JuniorsAre Johnson s Nomadic Days Over Share Tweet Forward Latest News Prospects Anaheim Ducks Arizona Coyotes Boston Bruins Buffalo Sabres Calgary Flames Carolina Hurricanes Chicago Blackhawks Colorado Avalanche Columbus Blue Jackets Dallas Stars Detroit Red Wings Edmonton Oilers Florida Panthers Los Angeles Kings Minnesota Wild Montreal Canadiens Nashville Predators New Jersey Devils New York Islanders New York Rangers Philadelphia Flyers Pittsburgh Penguins Ottawa Senators San Jose Sharks St Louis Blues Tampa Bay Lightning Toronto Maple Leafs Vancouver Canucks Washington Capitals Winnipeg Jets Copyright The Hockey Writers All rights reserved You are receiving this email because you opted in at The Hockey Writers or one of our Network Sites Our mailing address is The Hockey Writers Victoria Ave St Lambert QC J R R CanadaAdd us to your address book unsubscribe from this list update subscription preferences ActionsMark as UnreadMark as ReadMark as SpamStarClear StarArchive Previous Next ']
I figured this would be simple, but I'm sitting here scratching my head. Is this even possible? Should I use something else instead of a list? This is probably simple; I'm just blanking.
You have to keep track of the names yourself; Python won't do it for you.
names = 'gary gary2 Jesus jesus2 shop tech hockey hockey2'.split()
docs_new = [extract_message("C:\\Users\\Cody\\Documents\\Emails\\%s.html" % name)
            for name in names]

for name, category in zip(names, predicted):
    print('%r ---> %s' % (name, news.target_names[category]))
    if (news.target_names[category] == 'comp.sys.ibm.pc.hardware'):
        computer_emails.append(name)
Don't do this. Use a dictionary to hold your collection of emails, and you can print the dictionary keys when you want to know what is what.
docs_new = dict()
docs_new["tech"] = extract_message("C:\\Users\\Cody\\Documents\\Emails\\tech.html")
docs_new["gary"] = extract_message("C:\\Users\\Cody\\Documents\\Emails\\gary.html")
# etc.
When you iterate over the dictionary, you'll see the keys.
for doc, category in zip(docs_new, predicted):
    print('%s ---> %s' % (doc, news.target_names[category]))
(More dictionary basics: To iterate over dict values, replace docs_new above with docs_new.values(); or use docs_new.items() for both keys and values.)
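For example, a sketch of printing the key alongside each prediction (assuming a Python version where dicts preserve insertion order, i.e. 3.7+, or an OrderedDict on older versions, so that predicted lines up with the keys):

# `name` is the dictionary key, `doc` the extracted email text
for (name, doc), category in zip(docs_new.items(), predicted):
    print('%s ---> %s' % (name, news.target_names[category]))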

Searching online from text file with python checking if the names are cities in a specific country

So I have a large list of geographical locations that may be cities, counties, etc., in a particular country. For example, the country is Turkey and the list of names is: Erzurum, Eskisehir, etc.
I am wondering if I can put this list into a text file and use Python to check online what kind of geographical entity each name is, instead of just Googling each term individually. How would I do that?
You can also use the Google Geocoding API. Here is an example:
a.txt:
erzurum
istanbul
turkey
chicago
united states
india
kayseri
spain
Here is the code:
import urllib2
import json
import time

def getEntity(entityText):
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address=%s' % urllib2.quote(entityText)
    response = urllib2.urlopen(url)
    jsonaddress = json.loads(response.read())
    time.sleep(0.2)
    if jsonaddress['status'] == 'OK':
        return jsonaddress['results'][0]['types'][0]
    else:
        return None

with open('a.txt') as f:
    for line in f:
        entityText = line.strip()
        entity = getEntity(entityText)
        print entityText, entity
Output:
erzurum locality
istanbul locality
turkey country
chicago locality
united states country
india country
kayseri locality
spain country
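
The example above is Python 2. A rough Python 3 equivalent (a sketch; note that current versions of the Geocoding API require an API key, which is marked as an assumption here):

import json
import time
import urllib.parse
import urllib.request

API_KEY = 'YOUR_KEY_HERE'  # assumption: current Geocoding API requests need a key

def get_entity(entity_text):
    url = ('https://maps.googleapis.com/maps/api/geocode/json?address=%s&key=%s'
           % (urllib.parse.quote(entity_text), API_KEY))
    with urllib.request.urlopen(url) as response:
        jsonaddress = json.loads(response.read().decode('utf-8'))
    time.sleep(0.2)  # throttle requests a little
    if jsonaddress['status'] == 'OK':
        return jsonaddress['results'][0]['types'][0]
    return None

with open('a.txt') as f:
    for line in f:
        entity_text = line.strip()
        print(entity_text, get_entity(entity_text))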

Cannot scrape specific content from site - BeautifulSoup 4

I am having a hard time scraping this link with Python 3 and BeautifulSoup 4:
http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining
I only want to get this section.
When you are in ...
Capitol City Grille
This downtown Lansing restaurant offers ...
Capitol City Grille Lounge
For a glass of wine or a ...
Room Service
If you prefer ...
I have this code
for rest in dining_page_soup.select("div.copy_left p strong"):
    if rest.next_sibling is not None:
        if rest.next_sibling.next_sibling is not None:
            title = rest.text
            desc = rest.next_sibling.next_sibling
            print ("Title: "+title)
            print (desc)
But it gives me TypeError: 'NoneType' object is not callable
on desc = rest.next_sibling.next_sibling, even though I have an if statement to check whether it is None or not.
Here is a very simple solution:
from bs4 import BeautifulSoup
import requests

r = requests.get("http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining")
data = r.text
soup = BeautifulSoup(data)
for found_text in soup.select('div.copy_left'):
    print found_text.text
UPDATE
Following an improvement of the question, here is a solution using re.
A specific workaround has to be made for the first paragraph "When you..." since it does not follow the structure of the other paragraphs.
for tag in soup.find_all(re.compile("^strong")):
    title = tag.text
    desc = tag.next_sibling.next_sibling
    print ("Title: "+title)
    print (desc)
Output
Title: Capitol City Grille
This downtown Lansing restaurant offers delicious, contemporary
American cuisine in an upscale yet relaxed environment. You can enjoy
dishes that range from fluffy pancakes to juicy filet mignon steaks.
Breakfast and lunch buffets are available, as well as an à la carte
menu.
Title: Capitol City Grille Lounge
For a glass of wine or a hand-crafted cocktail and great conversation,
spend an afternoon or evening at Capitol City Grille Lounge with
friends or colleagues.
Title: Room Service
If you prefer to dine in the comfort of your own room, order from the
room service menu.
Title: Menus
Breakfast Menu
Title: Capitol City Grille Hours
Breakfast, 6:30-11 a.m.
Title: Capitol City Grille Lounge Hours
Mon-Thu, 11 a.m.-11 p.m.
Title: Room Service Hours
Daily, 6:30 a.m.-2 p.m. and 5-10 p.m.
If you don't mind using xpath, this should work
import requests
from lxml import html

url = "http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining"
page = requests.get(url).text
tree = html.fromstring(page)

xp_t = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/text()"
xp_d = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/../text()[not(following-sibling::strong)]"

titles = tree.xpath(xp_t)
descriptions = tree.xpath(xp_d)  # still contains garbage like '\r\n'
descriptions = [d.strip() for d in descriptions if d.strip()]

for t, d in zip(titles, descriptions):
    print("{title}: {description}".format(title=t, description=d))
Here descriptions contains 3 elements: "This downtown...", "For a glass...", "If you prefer...".
If you need also "When you are in the mood...", replace with this:
xp_d = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/../text()"
