How can I extract values from Tableau on this webpage - python
I am trying to extract the "mobility index" values for each state and county from this webpage:
https://www.cuebiq.com/visitation-insights-mobility-index/
The preferred output would be a panel data of place (state/county) by date for all available places and dates.
There is another thread (How can I scrape tooltips value from a Tableau graph embedded in a webpage) with a similar question. I tried to follow the solution there but it doesn't seem to work for my case.
Thanks a lot in advance.
(One way I have tried is to download the PDF files generated from Tableau, which contain all counties' values for a specific date. However, I still need to find a way to make a request for each date in the data. Let me know if you have a better idea than this route.)
This Tableau data URL doesn't return any data. In fact, it only renders images of the values (a canvas, probably), and I'm guessing it detects clicks based on coordinates. It's probably built this way to cache the values and render quickly.
But when you click on a state, it actually returns data, although it doesn't always return the result for the state itself (it does work for individual counties).
The solution I've found is to use the tooltip to get the data for the state. When you click a state, it generates a request like this:
POST https://public.tableau.com/{path}/{session_id}/commands/tabsrv/render-tooltip-server
with the following form params:
worksheet: US Map - State - CMI
dashboard: CMI
tupleIds: [18]
vizRegionRect: {"r":"viz","x":496,"y":148,"w":0,"h":0,"fieldVector":null}
allowHoverActions: false
allowPromptText: true
allowWork: false
useInlineImages: true
where tupleIds: [18] refers to the 1-based index of the state in a list of states in reverse alphabetical order, like this:
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]
It returns JSON containing the HTML of the tooltip, which has the CMI and YoY values you want to extract:
{
    "vqlCmdResponse": {
        "cmdResultList": [{
            "commandName": "tabsrv:render-tooltip-server",
            "commandReturn": {
                "tooltipText": "{\"htmlTooltip\": \"<HTML HERE WITH THE VALUES>\"}]},\"overlayAnchors\":[]}"
            }
        }]
    }
}
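Note that tooltipText is itself a JSON-encoded string, so it has to be decoded a second time. A minimal sketch, assuming r is the requests response of the tooltip call shown below:

tooltip_json = json.loads(r.json()["vqlCmdResponse"]["cmdResultList"][0]["commandReturn"]["tooltipText"])
tooltip_html = tooltip_json["htmlTooltip"]  # the HTML fragment that holds the CMI and YoY values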
The only caveat is that you'll have to make one request per state:
import requests
from bs4 import BeautifulSoup
import json
import time

data_host = "https://public.tableau.com"

r = requests.get(
    f"{data_host}/views/CMI-2_0/CMI",
    params={
        ":showVizHome": "no",
    })
soup = BeautifulSoup(r.text, "html.parser")
tableauData = json.loads(soup.find("textarea", {"id": "tsConfigContainer"}).text)

dataUrl = f'{data_host}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
r = requests.post(dataUrl, data={
    "sheet_id": tableauData["sheetId"],
})

data = []
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]

for stateIndex, state in enumerate(stateNames):
    time.sleep(0.5)  # for throttling
    r = requests.post(
        f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabsrv/render-tooltip-server',
        data={
            "worksheet": "US Map - State - CMI",
            "dashboard": "CMI",
            "tupleIds": f"[{stateIndex+1}]",
            "vizRegionRect": json.dumps({"r": "viz", "x": 496, "y": 148, "w": 0, "h": 0, "fieldVector": None}),
            "allowHoverActions": "false",
            "allowPromptText": "true",
            "allowWork": "false",
            "useInlineImages": "true"
        })
    tooltip = json.loads(r.json()["vqlCmdResponse"]["cmdResultList"][0]["commandReturn"]["tooltipText"])["htmlTooltip"]
    soup = BeautifulSoup(tooltip, "html.parser")
    rows = [
        t.find("tr").find_all("td")
        for t in soup.find_all("table")
    ]
    entry = {"state": state}
    for row in rows:
        if row[0].text == "Mobility Index:":
            entry["CMI"] = "".join([t.text.strip() for t in row[1:]])
        if row[0].text == "YoY (%):":
            entry["YoY"] = "".join([t.text.strip() for t in row[1:]])
    print(entry)
    data.append(entry)

print(data)
Try this on repl.it
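Since the question asks for panel data, a minimal sketch of turning the collected list into a table (assuming pandas is installed; the output filename is just an example, and you would add a date column if you capture multiple dates):

import pandas as pd

df = pd.DataFrame(data)              # columns: state, CMI, YoY
df.to_csv("state_cmi.csv", index=False)  # example output path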
To get the county information, it's the same as in the linked post: use the select endpoint, which gives you the data in the same format as the post you've linked in your question.
The following will extract data for all counties and states:
import requests
from bs4 import BeautifulSoup
import json
import time

data_host = "https://public.tableau.com"
worksheet = "US Map - State - CMI"
dashboard = "CMI"

r = requests.get(
    f"{data_host}/views/CMI-2_0/CMI",
    params={
        ":showVizHome": "no",
    })
soup = BeautifulSoup(r.text, "html.parser")
tableauData = json.loads(soup.find("textarea", {"id": "tsConfigContainer"}).text)

dataUrl = f'{data_host}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
r = requests.post(dataUrl, data={
    "sheet_id": tableauData["sheetId"],
})

data = []
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]

for stateIndex, state in enumerate(stateNames):
    time.sleep(0.5)  # for throttling
    r = requests.post(
        f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabsrv/render-tooltip-server',
        data={
            "worksheet": worksheet,
            "dashboard": dashboard,
            "tupleIds": f"[{stateIndex+1}]",
            "vizRegionRect": json.dumps({"r": "viz", "x": 496, "y": 148, "w": 0, "h": 0, "fieldVector": None}),
            "allowHoverActions": "false",
            "allowPromptText": "true",
            "allowWork": "false",
            "useInlineImages": "true"
        })
    tooltip = json.loads(r.json()["vqlCmdResponse"]["cmdResultList"][0]["commandReturn"]["tooltipText"])["htmlTooltip"]
    soup = BeautifulSoup(tooltip, "html.parser")
    rows = [
        t.find("tr").find_all("td")
        for t in soup.find_all("table")
    ]
    entry = {"state": state}
    for row in rows:
        if row[0].text == "Mobility Index:":
            entry["CMI"] = "".join([t.text.strip() for t in row[1:]])
        if row[0].text == "YoY (%):":
            entry["YoY"] = "".join([t.text.strip() for t in row[1:]])
    r = requests.post(
        f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabdoc/select',
        data={
            "worksheet": worksheet,
            "dashboard": dashboard,
            "selection": json.dumps({
                "objectIds": [stateIndex + 1],
                "selectionType": "tuples"
            }),
            "selectOptions": "select-options-simple"
        })
    entry["county_data"] = r.json()["vqlCmdResponse"]["layoutStatus"]["applicationPresModel"]["dataDictionary"]["dataSegments"]
    print(entry)
    data.append(entry)

print(data)
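The county_data field is the raw Tableau data dictionary, so it still needs to be unpacked. A minimal sketch of one way to inspect it, assuming the dataSegments entries follow the usual Tableau layout of dataColumns with dataType/dataValues (inspect the actual response to confirm before relying on it):

for entry in data:
    for segment in entry["county_data"].values():
        for column in segment.get("dataColumns", []):
            # each column carries a dataType and a list of raw dataValues
            print(entry["state"], column.get("dataType"), column.get("dataValues", [])[:5])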
Related
Creating multiple dataframes using a loop or function
I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached my code below. I want to pass three URLs and in return get three different dictionaries holding the values. I'm stuck and don't understand how to go about it. I have tried using loops, but it is not working out for me.

url = {'Bitcoin' : 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
       'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
       'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}

for ele in url:
    #### requesting the page and extracting the script which has date and values
    session = requests.Session()
    page = session.get(ele[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    values = str(soup.find_all('script')[4])
    values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
    #create an empty dict to append date and hashrates
    dict([("crypto_1 %s" % i, []) for i in range(len(url))])
    #run a loop over all the dates and adding to dictionary
    for i in range(values.count('new Date')):
        date = values.split('new Date("')[i+1].split('"')[0]
        value = values.split('"),')[i+1].split(']')[0]
        dict([("crypto_1 %s" % i)[date] = value
You can use the next example to get data from all 3 URLs and create a dataframe/dictionary from it:

import re
import requests
import pandas as pd

url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

data = []
for name, u in url.items():
    html_doc = requests.get(u).text
    for date, hash_rate in re.findall(r'\[new Date\("(.*?)"\),(.*?)\]', html_doc):
        data.append(
            {
                "Name": name,
                "Date": date,
                "Hash Rate": float("nan") if hash_rate == "null" else float(hash_rate),
            }
        )

df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/HashRate:
out = {}
for name, g in df.groupby("Name"):
    out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")

print(out)

Prints:

{
    "Bitcoin": {
        "Date": [
            Timestamp("2009-01-03 00:00:00"),
            Timestamp("2009-01-04 00:00:00"),
            Timestamp("2009-01-05 00:00:00"),
...
Cannot get the "href" attributes via BeautifulSoup
In short, I can't get the links from the "href" attribute on this site (a Turkish online seller of books and related items). Here's my code (I know it's not the best; I've been learning Python online for a few months, so any heads-up on best practices is also welcome). I tried to get the book names, writers, prices, publishers and the links for each book; without the links it works as I expected.

import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint

yazar = []
fiyat = []
yayın = []
isim = []

for i in range(1, 10):
    url = "https://www.dr.com.tr/CokSatanlar/Kitap#/page=" + str(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")
    # book names
    k = soup.find_all("a", {"class": "prd-name"})
    for i in k:
        isim.append(i.text)
    # writer names
    y = soup.find_all("a", {"class": "who text-overflow"})
    for i in y:
        yazar.append(i.text)
    # prices
    f = soup.find_all("div", {"class": "prd-price"})
    for i in f:
        fiyat.append(i.text.split()[0])
    # publishers
    ye = soup.find_all("a", {"class": "prd-publisher"})
    for i in ye:
        yayın.append(i.get("title"))
    sleep(randint(2, 4))

However, when I try to get the links with

soup.find_all("a", {"class": "prd-name"}).get("href")

it returns None, and I couldn't manage to make it work whatever I tried. Thank you all in advance, and sorry for a slightly longer than usual post.
The data you see on the page is loaded from an external location, so you need a different URL to get the correct data:

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.dr.com.tr/Catalog/CatalogProducts"

data = {
    "catalogId": "4020",
    "page": "1",
    "sortfield": "soldcount",
    "sortorder": "desc",
    "size": "60",
    "categoryid": "0",
    "parentId": "0",
    "mediatypes": "",
    "HideNotForSale": "true",
    "minPrice": "-1",
    "maxPrice": "-1",
    "writer": "",
    "minDiscount": "-1",
    "maxdiscount": "-1",
    "language": "",
}

all_data = []
for page in range(1, 3):  # <-- increase number of pages here
    print(f"Getting page {page}")
    data["page"] = page
    soup = BeautifulSoup(requests.post(url, data=data).content, "html.parser")
    for p in soup.select(".prd-content"):
        all_data.append(p.get_text(strip=True, separator="|").split("|")[:5])

df = pd.DataFrame(all_data, columns=["name", "autor", "price", "type", "publisher"])
print(df)
df.to_csv("data.csv", index=False)

Prints:

                                               name            autor      price        type         publisher
0  Esra Ezmeci Seti 5 Kitap Takım - Defter Hediyeli      Esra Ezmeci  155,45 TL  İnce Kapak  Destek Yayınları
1                              Şimdi Onlar Düşünsün  Bircan Yıldırım   36,20 TL  İnce Kapak  Destek Yayınları
2                        İz Bıraktığın Kadar Varsın      Esra Ezmeci   36,20 TL  İnce Kapak  Destek Yayınları
...

and saves data.csv.
I think you won't get None; you will get: AttributeError: ResultSet object has no attribute 'get'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()? find_all() produces a ResultSet, so you have to iterate over it to get each href:

for a in soup.find_all("a", {"class": "prd-name"}):
    print('https://www.dr.com.tr' + a.get("href"))

Output

https://www.dr.com.tr/kitap/daha-adil-bir-dunya-mumkun/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001934858001
https://www.dr.com.tr/kitap/burasi-cok-onemli-enerjiden-ekonomiye-tam-bagimsiz-turkiye/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001966362001
https://www.dr.com.tr/kitap/iz-biraktigin-kadar-varsin/egitim-basvuru/psikoloji-bilimi/urunno=0001947472001
https://www.dr.com.tr/kitap/simdi-onlar-dusunsun/bircan-yildirim/egitim-basvuru/kisisel-gelisim/urunno=0001964436001
https://www.dr.com.tr/kitap/kadinlar-sicak-erkekler-soguk-sever/esra-ezmeci/egitim-basvuru/psikoloji-bilimi/urunno=0001904239001
https://www.dr.com.tr/kitap/dustugunde-kalkarsan-hayat-guzeldir/egitim-basvuru/psikoloji-bilimi/urunno=0001816754001
...
Find coordinates in wikipedia pages iterating over a list
Probably this is a simple question, but my experience with for loops is very limited. I was trying to adapt the solution on this page https://www.mediawiki.org/wiki/API:Geosearch to some simple examples that I have, but the result is not what I expected. For example, I have this simple data frame:

df = pd.DataFrame({'City': ['Sesimbra', 'Ciudad Juárez', '31100 Treviso', 'Ramada Portugal', 'Olhão'],
                   'Country': ['Portugal', 'México', 'Itália', 'Portugal', 'Portugal']})

I created a list based on the cities:

lista_cidades = list(df['City'])

and I would like to iterate over this list to get the coordinates (decimal, preferably). So far I have tried this approach:

import requests

lng_dict = {}
lat_dict = {}

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "format": "json",
    "titles": [lista_cidades],
    "prop": "coordinates"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()
PAGES = DATA['query']['pages']

for i in range(len(lista_cidades)):
    for k, v in PAGES.items():
        try:
            lat_dict[lista_cidades[i]] = str(v['coordinates'][0]['lat'])
            lng_dict[lista_cidades[i]] = str(v['coordinates'][0]['lon'])
        except:
            pass

but it looks like the code doesn't iterate over the list and always returns the same coordinate. For example, when I call the dictionary with longitude coordinates, this is what I get:

lng_dict
{'Sesimbra': '-7.84166667', 'Ciudad Juárez': '-7.84166667', '31100 Treviso': '-7.84166667', 'Ramada Portugal': '-7.84166667', 'Olhão': '-7.84166667'}

What should I do to solve this? Thanks in advance.
I think the query returns only one result; it will take only the last city from your list (in your case the "Olhão" coordinates). You can check it by logging the DATA content. I do not know the Wikipedia API, but either your call lacks a parameter (the documentation should give you that information) or you have to call the API for each city, like:

import pandas as pd
import requests

df = pd.DataFrame({'City': ['Sesimbra', 'Ciudad Juárez', '31100 Treviso', 'Ramada Portugal', 'Olhão'],
                   'Country': ['Portugal', 'México', 'Itália', 'Portugal', 'Portugal']})

lista_cidades = list(df['City'])

lng_dict = {}
lat_dict = {}

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

for city in lista_cidades:
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": city,
        "prop": "coordinates"
    }
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    PAGES = DATA['query']['pages']
    for k, v in PAGES.items():
        try:
            lat_dict[city] = str(v['coordinates'][0]['lat'])
            lng_dict[city] = str(v['coordinates'][0]['lon'])
        except:
            pass
Scrape eBay Sold Items Using Selenium Returns []
I have almost no web scraping experience and wasn't able to solve this using BeautifulSoup, so I'm trying Selenium (installed it today). I'm trying to scrape sold items on eBay from:

https://www.ebay.com/sch/i.html?_from=R40&_nkw=oakley+sunglasses&_sacat=0&Brand=Oakley&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_oaa=1&_fsrp=1&_dcat=79720

Here is my code where I load the page and open it with Selenium:

ebay_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=oakley+sunglasses&_sacat=0&Brand=Oakley&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_oaa=1&_fsrp=1&_dcat=79720'

html = requests.get(ebay_url)
#print(html.text)

driver = wd.Chrome(executable_path=r'/Users/mburley/Downloads/chromedriver')
driver.get(ebay_url)

which correctly opens a new Chrome session at the correct URL. I'm working on getting the titles, prices, and dates sold and then loading them into a CSV file. Here is the code I have for that:

# Find all div tags and set equal to main_data
all_items = driver.find_elements_by_class_name("s-item__info clearfix")[1:]
#print(main_data)

# Loop over main_data to extract div classes for title, price, and date
for item in all_items:
    date = item.find_element_by_xpath("//span[contains(#class, 'POSITIVE']").text.strip()
    title = item.find_element_by_xpath("//h3[contains(#class, 's-item__title s-item__title--has-tags']").text.strip()
    price = item.find_element_by_xpath("//span[contains(#class, 's-item__price']").text.strip()
    print('title:', title)
    print('price:', price)
    print('date:', date)
    print('---')
    data.append([title, price, date])

Which just returns []. I think eBay may be blocking my IP, but the HTML loads in and looks correct. Hopefully someone can help! Thanks!
It is not necessary to use Selenium for eBay scraping, as the data is not rendered by JavaScript and can therefore be extracted from plain HTML. It is enough to use the BeautifulSoup web scraping library.
Keep in mind that problems with site parsing may arise when you request a site multiple times: eBay may decide that a bot is sending the requests (not a real user). To avoid this, one way is to send headers that contain a user-agent in the request; the site will then assume you're a user and display the information. As an additional step, rotate those user-agents. The ideal scenario is to use proxies in combination with rotated user-agents (besides a CAPTCHA solver).

from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}

params = {
    '_nkw': 'oakley+sunglasses',  # search query
    'LH_Sold': '1',               # shows sold items
    '_pgn': 1                     # page number
}

data = []

while True:
    page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text
        link = products.select_one(".s-item__link")["href"]

        data.append({
            "title": title,
            "price": price,
            "link": link
        })

    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))

Example output:

Extracting page: 1
----------
[
  {
    "title": "Shop on eBay",
    "price": "$20.00",
    "link": "https://ebay.com/itm/123456?hash=item28caef0a3a:g:E3kAAOSwlGJiMikD&amdata=enc%3AAQAHAAAAsJoWXGf0hxNZspTmhb8%2FTJCCurAWCHuXJ2Xi3S9cwXL6BX04zSEiVaDMCvsUbApftgXEAHGJU1ZGugZO%2FnW1U7Gb6vgoL%2BmXlqCbLkwoZfF3AUAK8YvJ5B4%2BnhFA7ID4dxpYs4jjExEnN5SR2g1mQe7QtLkmGt%2FZ%2FbH2W62cXPuKbf550ExbnBPO2QJyZTXYCuw5KVkMdFMDuoB4p3FwJKcSPzez5kyQyVjyiIq6PB2q%7Ctkp%3ABlBMULq7kqyXYA"
  },
  {
    "title": "Oakley X-metal Juliet Men's Sunglasses",
    "price": "$280.00",
    "link": "https://www.ebay.com/itm/265930582326?hash=item3deab2a936:g:t8gAAOSwMNhjRUuB&amdata=enc%3AAQAHAAAAoH76tlPncyxembf4SBvTKma1pJ4vg6QbKr21OxkL7NXZ5kAr7UvYLl2VoCPRA8KTqOumC%2Bl5RsaIpJgN2o2OlI7vfEclGr5Jc2zyO0JkAZ2Gftd7a4s11rVSnktOieITkfiM3JLXJM6QNTvokLclO6jnS%2FectMhVc91CSgZQ7rc%2BFGDjXhGyqq8A%2FoEyw4x1Bwl2sP0viGyBAL81D2LfE8E%3D%7Ctkp%3ABk9SR8yw1LH9YA"
  },
  {
    "title": "Used Oakley PROBATION Sunglasses Polished Gold/Dark Grey (OO4041-03)",
    "price": "$120.00",
    "link": "https://www.ebay.com/itm/334596701765?hash=item4de7847e45:g:d5UAAOSw4YtjTfEE&amdata=enc%3AAQAHAAAAoItMbbzfQ74gNUiinmOVnzKlPWE%2Fc54B%2BS1%2BrZpy6vm5lB%2Bhvm5H43UFR0zeCU0Up6sPU2Wl6O6WR0x9FPv5Y1wYKTeUbpct5vFKu8OKFBLRT7Umt0yxmtLLMWaVlgKf7StwtK6lQ961Y33rf3YuQyp7MG7H%2Fa9fwSflpbJnE4A9rLqvf3hccR9tlWzKLMj9ZKbGxWT17%2BjyUp19XIvX2ZI%3D%7Ctkp%3ABk9SR8yw1LH9YA"
  },
  ...

As an alternative, you can use the Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code that paginates through all pages:

from serpapi import EbaySearch
import os, json

params = {
    "api_key": os.getenv("API_KEY"),  # serpapi api key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": "oakley+sunglasses",      # search query
    "_pgn": 1,                        # page number
    "LH_Sold": "1"                    # shows sold items
}

search = EbaySearch(params)  # where data extraction happens

page_num = 0
data = []

while True:
    results = search.get_dict()  # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        link = organic_result.get("link")
        price = organic_result.get("price")

        data.append({
            "price": price,
            "link": link
        })

    page_num += 1
    print(page_num)

    if "next" in results.get("pagination", {}):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2))

Output:

[
  {
    "price": {
      "raw": "$68.96",
      "extracted": 68.96
    },
    "link": "https://www.ebay.com/itm/125360598217?epid=20030526224&hash=item1d3012ecc9:g:478AAOSwCt5iqgG5&amdata=enc%3AAQAHAAAA4Ls3N%2FEH5OR6w3uoTlsxUlEsl0J%2B1aYmOoV6qsUxRO1d1w3twg6LrBbUl%2FCrSTxNOjnDgIh8DSI67n%2BJe%2F8c3GMUrIFpJ5lofIRdEmchFDmsd2I3tnbJEqZjIkWX6wXMnNbPiBEM8%2FML4ljppkSl4yfUZSV%2BYXTffSlCItT%2B7ZhM1fDttRxq5MffSRBAhuaG0tA7Dh69ZPxV8%2Bu1HuM0jDQjjC4g17I3Bjg6J3daC4ZuK%2FNNFlCLHv97w2fW8tMaPl8vANMw8OUJa5z2Eclh99WUBvAyAuy10uEtB3NDwiMV%7Ctkp%3ABk9SR5DKgLD9YA"
  },
  {
    "price": {
      "raw": "$62.95",
      "extracted": 62.95
    },
    "link": "https://www.ebay.com/itm/125368283608?epid=1567457519&hash=item1d308831d8:g:rnsAAOSw7PJiqMQz&amdata=enc%3AAQAHAAAA4AwZhKJZfTqrG8VskZL8rtfsuNtZrMdWYpndpFs%2FhfrIOV%2FAjLuzNzaMNIvTa%2B6QUTdkOwTLRun8n43cZizqtOulsoBLQIwy3wf19N0sHxGF5HaIDOBeW%2B2sobRnzGdX%2Fsmgz1PRiKFZi%2BUxaLQpWCoGBf9n8mjcsFXi3esxbmAZ8kenO%2BARbRBzA2Honzaleb2tyH5Tf8%2Bs%2Fm5goqbon%2FcEsR0URO7BROkBUUjDCdDH6fFi99m6anNMMC3yTBpzypaFWio0u2qu5TgjABUfO1wzxb4ofA56BNKjoxttb7E%2F%7Ctkp%3ABk9SR5DKgLD9YA"
  },
  # ...
]

Disclaimer, I work for SerpApi.
You can use the code below to scrape the details; you can also use pandas to store the data in a CSV file.
Code:

ebay_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=oakley+sunglasses&_sacat=0&Brand=Oakley&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_oaa=1&_fsrp=1&_dcat=79720'

html = requests.get(ebay_url)
# print(html.text)

driver = wd.Chrome(executable_path=r'/Users/mburley/Downloads/chromedriver')
driver.maximize_window()
driver.implicitly_wait(30)
driver.get(ebay_url)

wait = WebDriverWait(driver, 20)

sold_date = []
title = []
price = []
i = 1
for item in driver.find_elements(By.XPATH, "//div[contains(@class,'title--tagblock')]/span[@class='POSITIVE']"):
    sold_date.append(item.text)
    title.append(driver.find_element_by_xpath(f"(//div[contains(@class,'title--tagblock')]/span[@class='POSITIVE']/ancestor::div[contains(@class,'tag')]/following-sibling::a/h3)[{i}]").text)
    price.append(item.find_element_by_xpath(f"(//div[contains(@class,'title--tagblock')]/span[@class='POSITIVE']/ancestor::div[contains(@class,'tag')]/following-sibling::div[contains(@class,'details')]/descendant::span[@class='POSITIVE'])[{i}]").text)
    i = i + 1

print(sold_date)
print(title)
print(price)

data = {
    'Sold_date': sold_date,
    'title': title,
    'price': price
}
df = pd.DataFrame.from_dict(data)
df.to_csv('out.csv', index=0)

Imports:

import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
Scrape Tables on Multiple Pages with Single URL
I am trying to scrape data from Fangraphs. The tables are split into 21 pages, but all of the pages use the same URL. I am very new to web scraping (and to Python in general), but Fangraphs does not have a public API, so scraping the page seems to be my only option. I am currently using BeautifulSoup to parse the HTML and I am able to scrape the initial table, but it only contains the first 30 players and I want the entire player pool. Two days of web searching and I am stuck. The link and my current code are below. I know they have a link to download the CSV file, but that gets tedious throughout the season and I would like to expedite the data-harvesting process. Any direction would be helpful, thank you.

https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=fangraphsdc

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=fangraphsdc&team=0&lg=all&players=0'
response = requests.get(url, verify=False)

# Use BeautifulSoup to parse the HTML code
soup = BeautifulSoup(response.content, 'html.parser')

# changes stat_table from ResultSet to a Tag
stat_table = stat_table[0]

# Convert html table to list
rows = []
for tr in stat_table.find_all('tr')[1:]:
    cells = []
    tds = tr.find_all('td')
    if len(tds) == 0:
        ths = tr.find_all('th')
        for th in ths:
            cells.append(th.text.strip())
    else:
        for td in tds:
            cells.append(td.text.strip())
    rows.append(cells)

# convert table to df
table = pd.DataFrame(rows)
import requests
from bs4 import BeautifulSoup
import pandas as pd

params = {
    "pos": "all",
    "stats": "bat",
    "type": "fangraphsdc"
}

data = {
    'RadScriptManager1_TSM': 'ProjectionBoard1$dg1',
    "__EVENTTARGET": "ProjectionBoard1$dg1",
    '__EVENTARGUMENT': 'FireCommand:ProjectionBoard1$dg1$ctl00;PageSize;1000',
    '__VIEWSTATEGENERATOR': 'C239D6F0',
    '__SCROLLPOSITIONX': '0',
    '__SCROLLPOSITIONY': '1366',
    "ProjectionBoard1_tsStats_ClientState": "{\"selectedIndexes\":[\"0\"],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1_tsPosition_ClientState": "{\"selectedIndexes\":[\"0\"],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1$rcbTeam": "All+Teams",
    "ProjectionBoard1_rcbTeam_ClientState": "",
    "ProjectionBoard1$rcbLeague": "All",
    "ProjectionBoard1_rcbLeague_ClientState": "",
    "ProjectionBoard1_tsProj_ClientState": "{\"selectedIndexes\":[\"5\"],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1_tsUpdate_ClientState": "{\"selectedIndexes\":[],\"logEntries\":[],\"scrollState\":{}}",
    "ProjectionBoard1$dg1$ctl00$ctl02$ctl00$PageSizeComboBox": "30",
    "ProjectionBoard1_dg1_ctl00_ctl02_ctl00_PageSizeComboBox_ClientState": "",
    "ProjectionBoard1$dg1$ctl00$ctl03$ctl01$PageSizeComboBox": "1000",
    "ProjectionBoard1_dg1_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState": "{\"logEntries\":[],\"value\":\"1000\",\"text\":\"1000\",\"enabled\":true,\"checkedIndices\":[],\"checkedItemsTextOverflows\":false}",
    "ProjectionBoard1_dg1_ClientState": ""
}

def main(url):
    with requests.Session() as req:
        r = req.get(url, params=params)
        soup = BeautifulSoup(r.content, 'html.parser')
        data['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
        data['__EVENTVALIDATION'] = soup.find("input", id="__EVENTVALIDATION").get("value")
        r = req.post(url, params=params, data=data)
        df = pd.read_html(r.content, attrs={'id': 'ProjectionBoard1_dg1_ctl00'})[0]
        df.drop(df.columns[1], axis=1, inplace=True)
        print(df)
        df.to_csv("data.csv", index=False)

main("https://www.fangraphs.com/projections.aspx")

Output: view-online