Extracting JavaScript Variables into Python Dictionaries
My understanding is that I need PyQt5 to render the page's JavaScript and BeautifulSoup to parse the resulting HTML. From that HTML, I am trying to convert the JavaScript variable _Flourish_data into a Python dictionary.
Is there an easy way to extract the JavaScript variable _Flourish_data into a Python dictionary? Here is my current Python code, which extracts the JavaScript using PyQt5 and BeautifulSoup:
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        # toHtml() is asynchronous; it delivers the HTML to the callback
        self.toHtml(self.Callable)

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()

page = Page('https://flo.uri.sh/visualisation/2451841/embed?auto=1')
soup = bs.BeautifulSoup(page.html, 'html.parser')
js_test = soup.find_all('script')
js_test[5]
The output of the existing code is:
<script>
function _Flourish_unflattenInto(dest, src) {
    dest = dest || {};
    for (var k in src) {
        var t = dest;
        for (var i = k.indexOf("."), p = 0; i >= 0; i = k.indexOf(".", p = i+1)) {
            var s = k.substring(p, i);
            if (!(s in t)) t[s] = {};
            t = t[s];
        }
        t[k.substring(p)] = src[k];
    }
    return dest;
}
var _Flourish_settings = {"cell_fill_1":"#ffffff","cell_fill_2":"#ebebeb","cell_fill_direction":"horizontal","cell_font_size":"1","cell_height":20,"cell_horizontal_alignment":"center","cell_link_color":"#2886b2","cell_padding_horizontal":16,"cell_padding_vertical":11,"column_width_mode":"auto","column_widths":"10%, 10%, 10%, 10%, 50%, 10%","header_fill":"#181f6c","header_font_color":"#ffffff","header_font_default":false,"header_font_size":1.1,"header_horizontal_alignment":"center","header_style_default":true,"layout.body_font":{"name":"Source Sans Pro","url":"https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,700"},"layout.layout_order":"stack-default","layout.space_between_sections":"0.5","mobile.view":true,"no_results_text":"Use the search bar to find your state","pagination_amount":41,"pagination_amount_search":"5","search_enabled":false,"search_hide_table":false,"search_placeholder":"Search to find your state","search_resize":true,"search_width":15};
_Flourish_unflattenInto(window.template.state, _Flourish_settings);
var _Flourish_data_column_names = {"rows":{"columns":["State ","Earliest/Planned Start Date for 20/21 Academic Year ","","","",""]}},
_Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};
for (var _Flourish_dataset in _Flourish_data) {
    window.template.data[_Flourish_dataset] = _Flourish_data[_Flourish_dataset];
    window.template.data[_Flourish_dataset].column_names = _Flourish_data_column_names[_Flourish_dataset];
}
window.template.draw();
</script>
I just want the _Flourish_data variable from the script tag, as shown below:
_Flourish_data = {"rows":[{"columns":["Alabama","Varies by district","","","",""]},{"columns":["Alaska","Varies by district","","","",""]},{"columns":["American Samoa","Unknown","","","",""]},{"columns":["Arizona","Varies by district","","","",""]},{"columns":["Arkansas","Varies by district","","","",""]},{"columns":["Bureau of Indian Education","Varies by district","","","",""]},{"columns":["California","Varies by district","","","",""]},{"columns":["Colorado","Varies by district","","","",""]},{"columns":["Connecticut","Not yet determined","","","",""]},{"columns":["Delaware","Varies by district","","","",""]},{"columns":["Department of Defense Education Activity\n ","Varies by district","","","",""]},{"columns":["District of Columbia","8/31/2020","","","",""]},{"columns":["Florida","Unknown","","","",""]},{"columns":["Georgia","Unknown","","","",""]},{"columns":["Guam","Unknown","","","",""]},{"columns":["Hawaii","Not yet determined","","","",""]},{"columns":["Idaho","Varies by District","","","",""]},{"columns":["Illinois","Varies by district","","","",""]},{"columns":["Indiana","Not yet determined","","","",""]},{"columns":["Iowa","Varies by district","","","",""]},{"columns":["Kansas","Not yet determined","","","",""]},{"columns":["Kentucky","Unknown","","","",""]},{"columns":["Louisiana","Varies by district","","","",""]},{"columns":["Maine","Varies by district","","","",""]},{"columns":["Maryland","Not yet determined","","","",""]},{"columns":["Massachusetts","Not yet determined","","","",""]},{"columns":["Michigan","Not yet determined","","","",""]},{"columns":["Minnesota","Not yet determined","","","",""]},{"columns":["Mississippi ","Varies by district","","","",""]},{"columns":["Missouri","Varies by district","","","",""]},{"columns":["Montana","Varies by district","","","",""]},{"columns":["Nebraska","Varies by district","","","",""]},{"columns":["Nevada","Varies by district","","","",""]},{"columns":["New Hampshire","Not yet determined","","","",""]},{"columns":["New Jersey","Varies by district","","","",""]},{"columns":["New Mexico","Unknown","","","",""]},{"columns":["New York","Not yet determined","","","",""]},{"columns":["North Carolina","8/17/2020","","","",""]},{"columns":["North Dakota","Varies by district","","","",""]},{"columns":["Northern Marianas","Unknown","","","",""]},{"columns":["Ohio","Not yet determined","","","",""]},{"columns":["Oklahoma","Varies by district","","","",""]},{"columns":["Oregon","Not yet determined","","","",""]},{"columns":["Pennsylvania","Varies by district","","","",""]},{"columns":["Puerto Rico","Unknown","","","",""]},{"columns":["Rhode Island","Not yet determined","","","",""]},{"columns":["South Carolina","Not yet determined","","","",""]},{"columns":["South Dakota","Varies by district","","","",""]},{"columns":["Tennessee","Varies by district","","","",""]},{"columns":["Texas","Varies by district","","","",""]},{"columns":["U.S. Virgin Islands\n ","Not yet determined","","","",""]},{"columns":["Utah","Varies by district","","","",""]},{"columns":["Vermont","Not yet determined","","","",""]},{"columns":["Virginia","Not yet determined","","","",""]},{"columns":["Washington","Varies by District","","","",""]},{"columns":["West Virginia","Not yet determined","","","",""]},{"columns":["Wisconsin","Varies by district","","","",""]},{"columns":["Wyoming","Not yet determined","","","",""]}]};
Any help would be greatly appreciated!
You don't need to execute the JavaScript. It can be done with the json and re modules.
For example:
import re
import json
import requests

url = 'https://flo.uri.sh/visualisation/2451841/embed?auto=1'
html_data = requests.get(url).text

# Grab the object literal assigned to _Flourish_data and parse it as JSON
data = re.search(r'_Flourish_data = (\{.*?\});', html_data).group(1)
data = json.loads(data)

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

for row in data['rows']:
    print('{:<55}{}'.format(*map(str.strip, row['columns'][:2])))
Prints:
Alabama Varies by district
Alaska Varies by district
American Samoa Unknown
Arizona Varies by district
Arkansas Varies by district
Bureau of Indian Education Varies by district
California Varies by district
Colorado Varies by district
Connecticut Not yet determined
Delaware Varies by district
Department of Defense Education Activity Varies by district
District of Columbia 8/31/2020
Florida Unknown
Georgia Unknown
Guam Unknown
Hawaii Not yet determined
Idaho Varies by District
Illinois Varies by district
Indiana Not yet determined
Iowa Varies by district
Kansas Not yet determined
Kentucky Unknown
Louisiana Varies by district
Maine Varies by district
Maryland Not yet determined
Massachusetts Not yet determined
Michigan Not yet determined
Minnesota Not yet determined
Mississippi Varies by district
Missouri Varies by district
Montana Varies by district
Nebraska Varies by district
Nevada Varies by district
New Hampshire Not yet determined
New Jersey Varies by district
New Mexico Unknown
New York Not yet determined
North Carolina 8/17/2020
North Dakota Varies by district
Northern Marianas Unknown
Ohio Not yet determined
Oklahoma Varies by district
Oregon Not yet determined
Pennsylvania Varies by district
Puerto Rico Unknown
Rhode Island Not yet determined
South Carolina Not yet determined
South Dakota Varies by district
Tennessee Varies by district
Texas Varies by district
U.S. Virgin Islands Not yet determined
Utah Varies by district
Vermont Not yet determined
Virginia Not yet determined
Washington Varies by District
West Virginia Not yet determined
Wisconsin Varies by district
Wyoming Not yet determined
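A note on the pattern: _Flourish_data = (\{.*?\}); assumes the object literal sits on a single line and is valid JSON, which holds for this page. If you would rather not depend on the trailing };, a slightly more defensive sketch (same URL as above) is to locate the assignment and let json.JSONDecoder consume exactly one object:

import re
import json
import requests

html_data = requests.get('https://flo.uri.sh/visualisation/2451841/embed?auto=1').text

# Find where the object literal starts, then decode exactly one JSON value;
# raw_decode stops at the end of the object, so no terminating ';' is needed.
start = re.search(r'_Flourish_data\s*=\s*', html_data).end()
data, _ = json.JSONDecoder().raw_decode(html_data[start:])
print(len(data['rows']))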
Another option, a more compact version of the same idea:

import requests
import re
import json

def main(url):
    r = requests.get(url)
    match = json.loads(re.search(r'_Flourish_data = ({.*})', r.text).group(1))
    print(match.keys())

main("https://flo.uri.sh/visualisation/2451841/embed?auto=1")
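If you want the rows as a table afterwards, here is a minimal follow-up sketch, assuming pandas is installed and data is the dictionary parsed in the first answer above. The column names are taken from _Flourish_data_column_names in the page source:

import pandas as pd

# Each row is {"columns": [...]}; keep the two populated cells per row
df = pd.DataFrame([row['columns'][:2] for row in data['rows']],
                  columns=['State', 'Earliest/Planned Start Date'])
print(df.head())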
Related
Scraping an HTML site using BeautifulSoup and finding the value of "total_pages" in it
I'm writing Python code that scrapes the following website and looks for the value of "total_pages" in it. The website is https://www.usnews.com/best-colleges/fl. When I open the website in a browser and inspect the source code, the value of "total_pages" is 8. I want my Python code to be able to get the same value. I have written the following code:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0',
           'Accept-Encoding': 'gzip, deflate',
           'Accept': '*/*',
           'Connection': 'keep-alive'}
main_site = requests.get("https://www.usnews.com/best-colleges/fl", headers=headers)
main_site_content = main_site.content
main_site_content_soup = BeautifulSoup(main_site_content, "html.parser")

But then I get stuck on how to look for "total_pages" in the parsed data. I have tried the find_all() method but had no luck; I think I'm not using the method correctly. One note: the solution does not have to use BeautifulSoup. I just used BeautifulSoup since I was a bit familiar with it.
No need for BeautifulSoup. Here I make a request to their API to get the list of universities. from rich import print is used to pretty-print the output; it should make it easier to read. Need more help or advice? Leave a comment below.

import requests
import pandas as pd
from rich import print

LINK = "https://www.usnews.com/best-colleges/api/search?format=json&location=Florida&_sort=rank&_sortDirection=asc&_page=1"

def get_data(url):
    print("Making request to:", url)
    response = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/5.0"})
    if response.status_code == 200:
        print("Request Successful!")
        data = response.json()["data"]
        return data["items"], data["next_link"]
    print("Request failed!")
    return None, None

def main():
    print("Starting Scraping...")
    items, next_link = get_data(LINK)
    # if there's a `next_link`, scrape it.
    while next_link is not None:
        print("Getting data from:", next_link)
        new_items, next_link = get_data(next_link)
        items += new_items
    # cleaning the data, for the pandas dataframe.
    items = [
        {
            "name": item["institution"]["displayName"],
            "state": item["institution"]["state"],
            "city": item["institution"]["city"],
        }
        for item in items
    ]
    df = pd.DataFrame(items)
    print(df.to_markdown())

if __name__ == "__main__":
    main()

The output looks like this:

    name                               state  city
0   University of Florida              FL     Gainesville
1   Florida State University           FL     Tallahassee
2   University of Miami                FL     Coral Gables
3   University of South Florida        FL     Tampa
4   University of Central Florida      FL     Orlando
5   Florida International University   FL     Miami
6   Florida A&M University             FL     Tallahassee
7   Florida Institute of Technology    FL     Melbourne
8   Nova Southeastern University       FL     Ft. Lauderdale
..  ...                                ...    ...
74  St. John Vianney College Seminary  FL     Miami
75  St. Petersburg College             FL     St. Petersburg
76  Tallahassee Community College      FL     Tallahassee
77  Valencia College                   FL     Orlando
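If all you need is the original total_pages value, you can also pull it straight out of the page HTML with a regex, the same trick used in the Flourish answer above. A minimal sketch; the exact "total_pages" key layout in the page source is an assumption based on the question:

import re
import requests

html = requests.get("https://www.usnews.com/best-colleges/fl",
                    headers={"User-Agent": "Mozilla/5.0"}).text

# "total_pages" appears in a JSON blob embedded in the page source (per the question)
m = re.search(r'"total_pages"\s*:\s*(\d+)', html)
if m:
    print(int(m.group(1)))  # expected: 8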
How to scrape information from tables selecting each of the Dropdown options using Selenium and Python?
Trying to help someone who works for a nonprofit. Currently trying to pull info from the STL County Boards/Commissions website (https://boards.stlouisco.com/). Having trouble for a few reasons: I was going to attempt to use BeautifulSoup, but the actual data isn't shown until you choose a Board/Commission from a dropdown bar above, so I have switched to Selenium, which I am new at. Is this task possible? When I look at the HTML for the site, I see that the info isn't stored in the page but is pulled from another location and displayed based on the option chosen from the dropdown menu:

function ShowMemberList(selectedBoard) {
    ClearMeetingsAndMembers();
    var htmlString = "";
    var boardsList = [{"id":407,"name":"Aging Ahead","isActive":true,"description":"... ...1.","totalSeats":14}];
    var totalMembers = boardsList[$("select[name='BoardsList'] option:selected").index() - 1].totalSeats;
    $.get("/api/boards/" + selectedBoard + "/members", function (data) {
        if (data.length > 0) {
            htmlString += "<table id=\"MemberTable\" class=\"table table-hover\">";
            htmlString += "<thead><th>Member Name</th><th>Title</th><th>Position</th><th>Expiration Date</th></thead><tbody>";
            for (var i = 0; i < totalMembers; i++) {
                if (i < data.length) {
                    htmlString += "<tr><td>" + FormatString(data[i].firstName) + " " + FormatString(data[i].lastName) + "</td><td>" + FormatString(data[i].title) + "</td><td>" + FormatString(data[i].position) + "</td><td>" + FormatString(data[i].expirationDate) + "</td></tr>";
                } else {
                    htmlString += "<tr><td colspan=\"4\">---Vacant Seat---</td></tr>"
                }
            }
            htmlString += "</tbody></table>";
        } else {
            htmlString = "<span id=\"MemberTable\">There was no data found for this board.</span>";
        }
        $("#Results").append(htmlString);
    });
}

So far, I have this (not a lot), which goes to the page and selects every board from the list:

driver = webdriver.Chrome()
driver.get("https://boards.stlouisco.com/")
select = Select(wait(driver, 10).until(EC.presence_of_element_located((By.ID, 'BoardsList'))))
options = select.options
for board in options:
    select.select_by_visible_text(board.text)

From here I would like to be able to scrape the info from the MemberTable, but I don't know how to move forward, whether this is within my abilities, or even whether it is possible with Selenium. I've tried using find_by with a few different elements to click on the members table but am met with errors. I have also tried calling for the members table after my select, but it is not able to find that element. Any tips/pointers/advice is appreciated!
You can use this script to save all members from all boards to CSV:

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://boards.stlouisco.com/'
members_url = 'https://boards.stlouisco.com/api/boards/{}/members'

soup = BeautifulSoup(requests.get(url).content, 'html.parser')

all_data = []
for o in soup.select('#BoardsList option[value]'):
    print(o['value'], o.text)
    # call the same API endpoint the page's own JavaScript uses
    data = requests.get(members_url.format(o['value'])).json()
    for d in data:
        all_data.append(dict(board=o.text, **d))

df = pd.DataFrame(all_data)
print(df)
df.to_csv('data.csv')

Prints:

                                                  board  boardMemberId  memberId boardName ...                       lastName                             title                                           position expirationDate
0                                           Aging Ahead          39003     27007      None ...                       Anderson                              None               ST. LOUIS COUNTY EXECUTIVE APPOINTEE      10/1/2020
1                                           Aging Ahead          38963     27797      None ...                         Bauers                              None  St. Charles County Community Action Agency App...           None
2                                           Aging Ahead          39004     27815      None ...                      Berkowitz                              None               ST. LOUIS COUNTY EXECUTIVE APPOINTEE      10/1/2020
3                                           Aging Ahead          38964     27798      None ...                         Biehle                              None  Jefferson County Community Action Corp. Appointee           None
4                                           Aging Ahead          38581     27597      None ...                         Bowers                              None               Franklin County Commission Appointee           None
..                                                  ...            ...       ...       ... ...                            ...                               ...                                                ...            ...
725  Zoo-Museum District - Zoological Park Subdistr...          38863     26745      None ...  Seat (Robert R. Hermann, Jr.)                                                                      St. Louis County     12/31/2019
726  Zoo-Museum District - Zoological Park Subdistr...          38864     26745      None ...           Seat (Winthrop Reed)                                                                      St. Louis County     12/31/2016
727  Zoo-Museum District - Zoological Park Subdistr...          38669     26745      None ...         Seat (Lawrence Thomas)                                                                      St. Louis County     12/31/2018
728  Zoo-Museum District - Zoological Park Subdistr...          38670     26745      None ...           Seat (Peggy Ritter )  Advisory Commissioner Non-Voting                                   St. Louis County     12/31/2019
729  Zoo-Museum District - Zoological Park Subdistr...          38394     27512      None ...                         Wilson  Advisory Commissioner Non-Voting                                  City of St. Louis           None

[730 rows x 9 columns]

And saves data.csv with all boards/members.
To select each Board/Commission from the html-select dropdown and scrape the page, induce WebDriverWait for element_to_be_clickable() and use the following locator strategy:

Code:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://boards.stlouisco.com/")
select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'BoardsList'))))
for option in select.options:
    option.click()
    print("Scraping: " + option.text)

Console output:

Scraping: ---Choose a Board---
Scraping: Aging Ahead
Scraping: Aging Ahead Advisory Council
Scraping: Air Pollution & Noise Control Appeal Board
Scraping: Animal Care & Control Advisory Board
Scraping: Bi-State Development Agency (Metro)
Scraping: Board Of Examiners For Mechanical Licensing
Scraping: Board of Freeholders
Scraping: Boundary Commission
Scraping: Building Code Review Committee
Scraping: Building Commission & Board Of Building Appeals
Scraping: Business Advisory Council
Scraping: Center for Educational Media
Scraping: Civil Service Commission
Scraping: Commission On Disabilities
Scraping: County Health Advisory Board
Scraping: Domestic And Family Violence Council
Scraping: East-West Gateway Council of Governments Board of Directors
Scraping: Economic Development Collaborative Advisory Board
Scraping: Economic Rescue Team
Scraping: Electrical Code Review Committee
Scraping: Electrical Examiners, Board Of
Scraping: Emergency Communications System Commission
Scraping: Equalization, Board Of
Scraping: Fire Standards Commission
Scraping: Friends of the Kathy J. Weinman Shelter for Battered Women, Inc.
Scraping: Fund Investment Advisory Committee
Scraping: Historic Building Commission
Scraping: Housing Authority
Scraping: Housing Resources Commission
Scraping: Human Relations Commission
Scraping: Industrial Development Authority Board
Scraping: Justice Services Advisory Board
Scraping: Lambert Airport Eastern Perimeter Joint Development Commission
Scraping: Land Clearance For Redevelopment Authority
Scraping: Lemay Community Improvement District
Scraping: Library Board
Scraping: Local Emergency Planning Committee
Scraping: Mechanical Code Review Committee
Scraping: Metropolitan Park And Recreation District Board Of Directors (Great Rivers Greenway)
Scraping: Metropolitan St. Louis Sewer District
Scraping: Metropolitan Taxicab Commission
Scraping: Metropolitan Zoological Park and Museum District Board
Scraping: Municipal Court Judges
Scraping: Older Adult Commission
Scraping: Parks And Recreation Advisory Board
Scraping: Planning Commission
Scraping: Plumbing Code Review Committee
Scraping: Plumbing Examiners, Board Of
Scraping: Police Commissioners, Board Of
Scraping: Port Authority Board Of Commissioners
Scraping: Private Security Advisory Committee
Scraping: Productive Living Board
Scraping: Public Transportation Commission of St. Louis County
Scraping: Regional Arts Commission
Scraping: Regional Convention & Sports Complex Authority
Scraping: Regional Convention & Visitors Commission
Scraping: REJIS Commission
Scraping: Restaurant Commission
Scraping: Retirement Board Of Trustees
Scraping: St. Louis Airport Commission
Scraping: St. Louis County Children's Service Fund Board
Scraping: St. Louis County Clean Energy Development Board (PACE)
Scraping: St. Louis County Workforce Development Board
Scraping: St. Louis Economic Development Partnership
Scraping: St. Louis Regional Health Commission
Scraping: St. Louis-Jefferson Solid Waste Management District
Scraping: Tax Increment Financing Commission of St. Louis County
Scraping: Transportation Board
Scraping: Waste Management Commission
Scraping: World Trade Center - St. Louis
Scraping: Zoning Adjustment, Board of
Scraping: Zoo-Museum District - Art Museum Subdistrict Board of Commissioners
Scraping: Zoo-Museum District - Botanical Garden Subdistrict Board of Commissioners
Scraping: Zoo-Museum District - Missouri History Museum Subdistrict Board of Commissioners
Scraping: Zoo-Museum District - St. Louis Science Center Subdistrict Board of Commissioners
Scraping: Zoo-Museum District - Zoological Park Subdistrict Board of Commissioners

References

You can find a couple of relevant discussions in:
Message: Element could not be scrolled into view while trying to click on an option within a dropdown menu through Selenium
How to open the option items of a select tag (dropdown) in different tabs/windows?
Want to store variable names in list, not said variable's contents
Sorry if the title is confusing; let me explain. I've written a program that categorizes emails by topic using nltk and tools from sklearn. Here is that code:

# Extract emails
tech = extract_message("C:\\Users\\Cody\\Documents\\Emails\\tech.html")
gary = extract_message("C:\\Users\\Cody\\Documents\\Emails\\gary.html")
gary2 = extract_message("C:\\Users\\Cody\\Documents\\Emails\\gary2.html")
jesus = extract_message("C:\\Users\\Cody\\Documents\\Emails\\Jesus.html")
jesus2 = extract_message("C:\\Users\\Cody\\Documents\\Emails\\jesus2.html")
hockey = extract_message("C:\\Users\\Cody\\Documents\\Emails\\hockey.html")
hockey2 = extract_message("C:\\Users\\Cody\\Documents\\Emails\\hockey2.html")
shop = extract_message("C:\\Users\\Cody\\Documents\\Emails\\shop.html")

# Build dictionary of features
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(news.data)

# Downscaling
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
tf_transformer = TfidfTransformer(use_idf=False).fit(x_train_counts)
x_train_tf = tf_transformer.transform(x_train_counts)

# Train classifier
clf = MultinomialNB().fit(x_train_tfidf, news.target)

# List of the extracted emails
docs_new = [gary, gary2, jesus, jesus2, shop, tech, hockey, hockey2]

# Extract features from emails
x_new_counts = count_vect.transform(docs_new)
x_new_tfidf = tfidf_transformer.transform(x_new_counts)

# Predict the categories for each email
predicted = clf.predict(x_new_tfidf)

Now I'm looking to store each variable in an appropriate list, based on the predicted label. I figured I could do that like this:

# Store files in a category
hockey_emails = []
computer_emails = []
politics_emails = []
tech_emails = []
religion_emails = []
forsale_emails = []

# Print out results and store each email in the appropriate category list
for doc, category in zip(docs_new, predicted):
    print('%r ---> %s' % (doc, news.target_names[category]))
    if news.target_names[category] == 'comp.sys.ibm.pc.hardware':
        computer_emails.append(doc)
    if news.target_names[category] == 'rec.sport.hockey':
        hockey_emails.append(doc)
    if news.target_names[category] == 'talk.politics.misc':
        politics_emails.append(doc)
    if news.target_names[category] == 'soc.religion.christian':
        religion_emails.append(doc)
    if news.target_names[category] == 'misc.forsale':
        forsale_emails.append(doc)

My output, if I print out one of these lists (say hockey), displays the contents stored in the variable rather than the variable's name. I want this:

print(hockey_emails)
output: ['hockey', 'hockey2']

but instead I'm getting this:

output: ['View View online click here Hi Thanks for signing up as a EA SPORTS NHL insider You ll now receive all of the latest and greatest news and info at this e mail address as you ve requested EA com If you need technical assistance please contact EA Help Privacy Policy Our Certified Online Privacy Policy gives you confidence whenever you play EA games To view our complete Privacy and Cookie Policy go to privacy ea com or write to Privacy Policy Administrator Electronic Arts Inc Redwood Shores Parkway Redwood City CA Electronic Arts Inc All Rights Reserved Privacy Policy User Agreement Legal ActionsMark as UnreadMark as ReadMark as SpamStarClear StarArchive Previous Next ', 'View News From The Hockey Writers The Editor s Choice stories from The Hockey Writers View this email in your browser edition Recap Stars Steamroll Predators By Matt Pryor on Dec am As the old Mary Chapin Carpenter song goes Sometimes you re the windshield Sometimes you re the bug It hasn t happened very often this season but the Dallas Stars had a windshield Continue Reading A Review of Years in Blue and White Damien Cox One on One By Anthony Fusco on Dec pm The Toronto Maple Leafs are one of the most storied and iconic franchises in the entire National Hockey League They have a century of history that spans all the way back to the early s When you have an Continue Reading Bruins Will Not Miss Beleskey By Kyle Benson on Dec am On Monday it was announced that Matt Beleskey will miss the next six weeks due to a knee injury he sustained over the weekend in a game against the Buffalo Sabres Six weeks is a long stint to be without a potential top Continue Reading Recent Articles Galchenyuk Injury Costly for CanadiensFacing Off Picking Team Canada for World JuniorsAre Johnson s Nomadic Days Over Share Tweet Forward Latest News Prospects Anaheim Ducks Arizona Coyotes Boston Bruins Buffalo Sabres Calgary Flames Carolina Hurricanes Chicago Blackhawks Colorado Avalanche Columbus Blue Jackets Dallas Stars Detroit Red Wings Edmonton Oilers Florida Panthers Los Angeles Kings Minnesota Wild Montreal Canadiens Nashville Predators New Jersey Devils New York Islanders New York Rangers Philadelphia Flyers Pittsburgh Penguins Ottawa Senators San Jose Sharks St Louis Blues Tampa Bay Lightning Toronto Maple Leafs Vancouver Canucks Washington Capitals Winnipeg Jets Copyright The Hockey Writers All rights reserved You are receiving this email because you opted in at The Hockey Writers or one of our Network Sites Our mailing address is The Hockey Writers Victoria Ave St Lambert QC J R R CanadaAdd us to your address book unsubscribe from this list update subscription preferences ActionsMark as UnreadMark as ReadMark as SpamStarClear StarArchive Previous Next ']

I figured this would be simple, but I'm sitting here scratching my head. Is this even possible? Should I use something else instead of a list? This is probably simple; I'm just blanking.
You have to keep track of the names yourself; Python won't do it for you.

names = 'gary gary2 Jesus jesus2 shop tech hockey hockey2'.split()
docs_new = [extract_message("C:\\Users\\Cody\\Documents\\Emails\\%s.html" % name) for name in names]

for name, category in zip(names, predicted):
    print('%r ---> %s' % (name, news.target_names[category]))
    if news.target_names[category] == 'comp.sys.ibm.pc.hardware':
        computer_emails.append(name)
Don't do this. Use a dictionary to hold your collection of emails, and you can print the dictionary keys when you want to know what is what.

docs_new = dict()
docs_new["tech"] = extract_message("C:\\Users\\Cody\\Documents\\Emails\\tech.html")
docs_new["gary"] = extract_message("C:\\Users\\Cody\\Documents\\Emails\\gary.html")
# etc.

When you iterate over the dictionary, you'll see the keys:

for doc, category in zip(docs_new, predicted):
    print('%s ---> %s' % (doc, news.target_names[category]))

(More dictionary basics: to iterate over dict values, replace docs_new above with docs_new.values(); or use docs_new.items() for both keys and values.)
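Building on that idea, here is a minimal sketch of the sorting step itself, assuming docs_new (the dict above), predicted, and news exist as in the question: group the email names by predicted label with a defaultdict, so no per-category list variables are needed.

from collections import defaultdict

# The keys of docs_new are the email names; zip pairs each name with its prediction
emails_by_category = defaultdict(list)
for name, category in zip(docs_new, predicted):
    emails_by_category[news.target_names[category]].append(name)

print(emails_by_category['rec.sport.hockey'])  # e.g. ['hockey', 'hockey2']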
Searching online from text file with python checking if the names are cities in a specific country
I have a large list of geographical locations that may be cities, counties, etc., in a particular country. For example, the country is Turkey and the list of names is: Erzurum, Eskisehir, etc. I am wondering if I can put this list into a text file and use Python to check these names online to determine what kind of geographical entity each one is, instead of just Googling each term individually. How would I do that?
You can also use the Google Geocoding API. Here is an example (updated to Python 3 from the original Python 2 answer).

a.txt:

erzurum
istanbul
turkey
chicago
united states
india
kayseri
spain

Here is the code:

import json
import time
import urllib.parse
import urllib.request

def getEntity(entityText):
    # Note: current versions of the Geocoding API require an API key (&key=...)
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address=%s' % urllib.parse.quote(entityText)
    response = urllib.request.urlopen(url)
    jsonaddress = json.loads(response.read())
    time.sleep(0.2)  # stay under the rate limit
    if jsonaddress['status'] == 'OK':
        return jsonaddress['results'][0]['types'][0]
    return None

with open('a.txt') as f:
    for line in f:
        entityText = line.strip()
        entity = getEntity(entityText)
        print(entityText, entity)

Output:

erzurum locality
istanbul locality
turkey country
chicago locality
united states country
india country
kayseri locality
spain country
Cannot scrape specific content from site - BeautifulSoup 4
I am having a hard time scraping this link via Python 3 and BeautifulSoup 4: http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining

I only want to get this section:

When you are in ...
Capitol City Grille
This downtown Lansing restaurant offers ...
Capitol City Grille Lounge
For a glass of wine or a ...
Room Service
If you prefer ...

I have this code:

for rest in dining_page_soup.select("div.copy_left p strong"):
    if rest.next_sibling is not None:
        if rest.next_sibling.next_sibling is not None:
            title = rest.text
            desc = rest.next_sibling.next_sibling
            print("Title: " + title)
            print(desc)

But it gives me TypeError: 'NoneType' object is not callable on desc = rest.next_sibling.next_sibling, even though I have an if statement to check whether it is None or not.
Here is a very simple solution:

from bs4 import BeautifulSoup
import requests

r = requests.get("http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining")
data = r.text
soup = BeautifulSoup(data, "html.parser")

for found_text in soup.select('div.copy_left'):
    print(found_text.text)

UPDATE

Following an improvement to the question, here is a solution using a regular expression. A specific workaround has to be made for the first paragraph ("When you...") since it does not follow the structure of the other paragraphs.

import re

for tag in soup.find_all(re.compile("^strong")):
    title = tag.text
    desc = tag.next_sibling.next_sibling
    print("Title: " + title)
    print(desc)

Output:

Title: Capitol City Grille
This downtown Lansing restaurant offers delicious, contemporary American cuisine in an upscale yet relaxed environment. You can enjoy dishes that range from fluffy pancakes to juicy filet mignon steaks. Breakfast and lunch buffets are available, as well as an à la carte menu.

Title: Capitol City Grille Lounge
For a glass of wine or a hand-crafted cocktail and great conversation, spend an afternoon or evening at Capitol City Grille Lounge with friends or colleagues.

Title: Room Service
If you prefer to dine in the comfort of your own room, order from the room service menu.

Title: Menus
Breakfast Menu

Title: Capitol City Grille Hours
Breakfast, 6:30-11 a.m.

Title: Capitol City Grille Lounge Hours
Mon-Thu, 11 a.m.-11 p.m.

Title: Room Service Hours
Daily, 6:30 a.m.-2 p.m. and 5-10 p.m.
If you don't mind using XPath, this should work:

import requests
from lxml import html

url = "http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining"
page = requests.get(url).text
tree = html.fromstring(page)

xp_t = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/text()"
xp_d = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/../text()[not(following-sibling::strong)]"

titles = tree.xpath(xp_t)
descriptions = tree.xpath(xp_d)
# still contains garbage like '\r\n'
descriptions = [d.strip() for d in descriptions if d.strip()]

for t, d in zip(titles, descriptions):
    print("{title}: {description}".format(title=t, description=d))

Here descriptions contains 3 elements: "This downtown...", "For a glass...", "If you prefer...". If you also need "When you are in the mood...", replace xp_d with this:

xp_d = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/../text()"