Why is this web scrape not working in Python?

I have recently been using the code attached. For the past few weeks it has been working completely fine and always produced results. However, when I ran it today it didn't work for some reason. Could you please help and provide a solution to the problem?
import requests, json
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {"q": "dji", "hl": "en", 'gl': 'us', 'tbm': 'shop'}

response = requests.get("https://www.google.com/search",
                        params=params,
                        headers=headers)
soup = BeautifulSoup(response.text, 'lxml')

# list with two dict() combined
shopping_data = []
shopping_results_dict = {}

for shopping_result in soup.select('.sh-dgr__content'):
    title = shopping_result.select_one('.Lq5OHe.eaGTj h4').text
    product_link = f"https://www.google.com{shopping_result.select_one('.Lq5OHe.eaGTj')['href']}"
    source = shopping_result.select_one('.IuHnof').text
    price = shopping_result.select_one('span.kHxwFf span').text

    try:
        rating = shopping_result.select_one('.Rsc7Yb').text
    except:
        rating = None

    try:
        reviews = shopping_result.select_one('.Rsc7Yb').next_sibling.next_sibling
    except:
        reviews = None

    try:
        delivery = shopping_result.select_one('.vEjMR').text
    except:
        delivery = None

    shopping_results_dict.update({
        'shopping_results': [{
            'title': title,
            'link': product_link,
            'source': source,
            'price': price,
            'rating': rating,
            'reviews': reviews,
            'delivery': delivery,
        }]
    })

    shopping_data.append(dict(shopping_results_dict))

print(title)

Because .select() in for shopping_result in soup.select('.sh-dgr__content'): could not find any elements, it returned an empty list. Therefore the body of the for-loop was never executed and Python skipped straight past the loop.
title only exists and is defined when the body of the for-loop executes.
You should make sure you are using a selector that actually finds your element(s).
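As a quick sanity check, you can confirm whether the selector matches anything at all before relying on variables that are only defined inside the loop. A minimal sketch, reusing soup and response from the code above; the debug.html filename is just an illustration:

results = soup.select('.sh-dgr__content')
print(f"matched {len(results)} result containers")

if not results:
    # Google may have changed its markup, or returned a consent/CAPTCHA page
    # instead of search results. Dump the HTML to inspect what actually came back.
    with open("debug.html", "w", encoding="utf-8") as f:
        f.write(response.text)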

Related

Unable to query graphql with a sha256 hash to scrape property links from a webpage

After visiting this website, when I fill out the input box with Sydney CBD, NSW and hit the search button, I can see the required results displayed on that site.
I wish to scrape the property links using the requests module. With the following attempt, I can get the property links from the first page.
The problem is that I hardcoded the value of sha256Hash within params, which is not what I want to do. I don't know whether the ID retrieved by issuing a GET request to the suggestion URL needs to be converted to a sha256 hash.
However, when I do that using the function get_hashed_string(), the value it produces is different from the hardcoded one available within params. As a result, the script throws a KeyError on this line: container = res.json().
import requests
import hashlib
from pprint import pprint
from bs4 import BeautifulSoup

url = 'https://suggest.realestate.com.au/consumer-suggest/suggestions'
link = 'https://lexa.realestate.com.au/graphql'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}

payload = {
    'max': '7',
    'type': 'suburb,region,precinct,state,postcode',
    'src': 'homepage-web',
    'query': 'Sydney CBD, NSW'
}

params = {"operationName":"searchByQuery","variables":{"query":"{\"channel\":\"buy\",\"page\":1,\"pageSize\":25,\"filters\":{\"surroundingSuburbs\":true,\"excludeNoSalePrice\":false,\"ex-under-contract\":false,\"ex-deposit-taken\":false,\"excludeAuctions\":false,\"excludePrivateSales\":false,\"furnished\":false,\"petsAllowed\":false,\"hasScheduledAuction\":false},\"localities\":[{\"searchLocation\":\"sydney cbd, nsw\"}]}","testListings":False,"nullifyOptionals":False},"extensions":{"persistedQuery":{"version":1,"sha256Hash":"ef58e42a4bd826a761f2092d573ee0fb1dac5a70cd0ce71abfffbf349b5b89c1"}}}

def get_hashed_string(keyword):
    hashed_str = hashlib.sha256(keyword.encode('utf-8')).hexdigest()
    return hashed_str

with requests.Session() as s:
    s.headers.update(headers)
    r = s.get(url, params=payload)
    hashed_id = r.json()['_embedded']['suggestions'][0]['id']
    # params['extensions']['persistedQuery']['sha256Hash'] = get_hashed_string(hashed_id)
    res = s.post(link, json=params)
    container = res.json()['data']['buySearch']['results']['exact']['items']
    for item in container:
        print(item['listing']['_links']['canonical']['href'])
If I run the script as is, it works beautifully. When I uncomment the line that overwrites params['extensions']['persistedQuery']['sha256Hash'] and run the script again, the script breaks.
How can I generate the value of sha256Hash and use it within the script above?
This is not how GraphQL works here. The SHA-256 value stays the same across all requests; what you're missing is a valid GraphQL query.
You have to reconstruct that first and then just use the API's pagination - that's the key.
Here's how:
import json
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Accept": "application/graphql+json, application/json",
    "Content-Type": "application/json",
    "Host": "lexa.realestate.com.au",
    "Referer": "https://www.realestate.com.au/",
}

endpoint = "https://lexa.realestate.com.au/graphql"

graph_query = "{\"channel\":\"buy\",\"page\":page_number,\"pageSize\":25,\"filters\":{\"surroundingSuburbs\":true," \
              "\"excludeNoSalePrice\":false,\"ex-under-contract\":false,\"ex-deposit-taken\":false," \
              "\"excludeAuctions\":false,\"excludePrivateSales\":false,\"furnished\":false,\"petsAllowed\":false," \
              "\"hasScheduledAuction\":false},\"localities\":[{\"searchLocation\":\"sydney cbd, nsw\"}]}"

graph_json = {
    "operationName": "searchByQuery",
    "variables": {
        "query": "",
        "testListings": False,
        "nullifyOptionals": False
    },
    "extensions": {
        "persistedQuery": {
            "version": 1,
            "sha256Hash": "ef58e42a4bd826a761f2092d573ee0fb1dac5a70cd0ce71abfffbf349b5b89c1"
        }
    }
}

if __name__ == '__main__':
    with requests.Session() as s:
        for page in range(1, 3):
            # substitute the page number into the query template before each request
            graph_json['variables']['query'] = graph_query.replace('page_number', str(page))
            r = s.post(endpoint, headers=headers, data=json.dumps(graph_json))
            listing = r.json()['data']['buySearch']['results']['exact']['items']
            for item in listing:
                print(item['listing']['_links']['canonical']['href'])
This should give you:
https://www.realestate.com.au/property-apartment-nsw-sydney-140558991
https://www.realestate.com.au/property-apartment-nsw-sydney-141380404
https://www.realestate.com.au/property-apartment-nsw-sydney-140310979
https://www.realestate.com.au/property-apartment-nsw-sydney-141259592
https://www.realestate.com.au/property-apartment-nsw-barangaroo-140555291
https://www.realestate.com.au/property-apartment-nsw-sydney-140554403
https://www.realestate.com.au/property-apartment-nsw-millers+point-141245584
https://www.realestate.com.au/property-apartment-nsw-haymarket-139205259
https://www.realestate.com.au/project/hyde-metropolitan-by-deicorp-sydney-600036803
https://www.realestate.com.au/property-apartment-nsw-haymarket-140807411
https://www.realestate.com.au/property-apartment-nsw-sydney-141370756
https://www.realestate.com.au/property-apartment-nsw-sydney-141370364
https://www.realestate.com.au/property-apartment-nsw-haymarket-140425111
https://www.realestate.com.au/project/greenland-centre-sydney-600028910
https://www.realestate.com.au/property-apartment-nsw-sydney-141364136
https://www.realestate.com.au/property-apartment-nsw-sydney-139367203
https://www.realestate.com.au/property-apartment-nsw-sydney-141156696
https://www.realestate.com.au/property-apartment-nsw-sydney-141362880
https://www.realestate.com.au/property-studio-nsw-sydney-141311384
https://www.realestate.com.au/property-apartment-nsw-haymarket-141354876
https://www.realestate.com.au/property-apartment-nsw-the+rocks-140413283
https://www.realestate.com.au/property-apartment-nsw-sydney-141350552
https://www.realestate.com.au/property-apartment-nsw-sydney-140657935
https://www.realestate.com.au/property-apartment-nsw-barangaroo-139149039
https://www.realestate.com.au/property-apartment-nsw-haymarket-141034784
https://www.realestate.com.au/property-apartment-nsw-sydney-141230640
https://www.realestate.com.au/property-apartment-nsw-barangaroo-141340768
https://www.realestate.com.au/property-apartment-nsw-haymarket-141337684
https://www.realestate.com.au/property-unitblock-nsw-millers+point-141337528
https://www.realestate.com.au/property-apartment-nsw-sydney-141028828
https://www.realestate.com.au/property-apartment-nsw-sydney-141223160
https://www.realestate.com.au/property-apartment-nsw-sydney-140643067
https://www.realestate.com.au/property-apartment-nsw-sydney-140768179
https://www.realestate.com.au/property-apartment-nsw-haymarket-139406051
https://www.realestate.com.au/property-apartment-nsw-haymarket-139406047
https://www.realestate.com.au/property-apartment-nsw-sydney-139652067
https://www.realestate.com.au/property-apartment-nsw-sydney-140032667
https://www.realestate.com.au/property-apartment-nsw-sydney-127711002
https://www.realestate.com.au/property-apartment-nsw-sydney-140903924
https://www.realestate.com.au/property-apartment-nsw-walsh+bay-139130519
https://www.realestate.com.au/property-apartment-nsw-sydney-140285823
https://www.realestate.com.au/property-apartment-nsw-sydney-140761223
https://www.realestate.com.au/project/111-castlereagh-sydney-600031082
https://www.realestate.com.au/property-apartment-nsw-sydney-140633099
https://www.realestate.com.au/property-apartment-nsw-haymarket-141102892
https://www.realestate.com.au/property-apartment-nsw-sydney-139522379
https://www.realestate.com.au/property-apartment-nsw-sydney-139521259
https://www.realestate.com.au/property-apartment-nsw-sydney-139521219
https://www.realestate.com.au/property-apartment-nsw-haymarket-140007279
https://www.realestate.com.au/property-apartment-nsw-haymarket-139156515
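A note on the design: the value of variables.query is itself a JSON string, so instead of string-replacing page_number you could build it as a dict and serialize it with json.dumps. A minimal sketch, assuming the API tolerates omitting the optional filter fields shown above:

query = {
    "channel": "buy",
    "page": page,                 # page number from the loop above
    "pageSize": 25,
    "filters": {"surroundingSuburbs": True},
    "localities": [{"searchLocation": "sydney cbd, nsw"}],
}
graph_json["variables"]["query"] = json.dumps(query)   # json.dumps turns True into lowercase true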

Not able to extract data or find problem with code

Below is the code for extracting news articles about different companies from Google News. It gives me an empty Excel file with only the headers. I am not able to figure out what the error is. Can someone please help out? (The entire code can be copy-pasted and reproduced on your machine.)
import requests
import random
from collections import OrderedDict

def list_header():
    headers_list = [
        # Firefox 24 Linux
        {
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        },
        # Firefox Mac
        {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }]
    return headers_list

def list_dict():
    # Get headers list
    headers_list = list_header()
    # Create ordered dict from headers above
    ordered_headers_list = []
    for headers in headers_list:
        h = OrderedDict()
        for header, value in headers.items():
            h[header] = value
        ordered_headers_list.append(h)
    return ordered_headers_list

def list_test():
    headers_list = list_dict()
    max = len(headers_list)
    url = 'https://httpbin.org/headers'
    for i in range(0, max):
        # Pick random browser headers
        headers = random.choice(headers_list)
        # Create a request session
        r = requests.Session()
        r.headers = headers
        response = r.get(url)
        print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Received by HTTPBin:" % (i, headers['User-Agent']))
        print(response.json())
        print("-------------------")

def random_header():
    headers_list = list_dict()
    headers = random.choice(headers_list)
    return headers

import pandas as pd

def ingest_google_news():
    ticker_list = ['AAPL.O', 'MSFT', 'GOOG', '2222.SR', 'AMZN', 'FB', 'TSLA', 'BRK-A', 'TCEHY', 'TSM', 'NVDA', 'JPM', 'BABA', 'V', 'JNJ', '005930.KS', 'WMT', 'LVMUY']
    sep = '.'
    df = pd.DataFrame()
    t_news = []
    t_publisher = []
    t_urls = []
    t_dates = []
    t_tickers = []
    for t in ticker_list:
        news = []
        publisher = []
        urls = []
        dates = []
        tickers = []
        # cleaning ticker
        ticker = t
        t = t.split(sep, 1)[0]
        # set header by random user agent
        r = requests.Session()
        headers = random_header()
        r.headers = headers
        # print(headers)
        # set query for google
        query = '{} news'.format(t)
        url = f"https://www.google.com/search?q={query}&tbm=nws&lr=lang_en&hl=en&sort=date&num=5"
        res = r.get(url, headers=headers)
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        links = soup.select(".dbsr a")
        for l in links:
            tickers.append(t)
            try:
                url_w = l.get("href")
                print(url_w)
                urls.append(url_w)
                # NOTE: find_date is never defined or imported in this snippet;
                # in the linked article it presumably comes from the htmldate package
                dt = find_date(url_w)
                dates.append(dt)
                res = requests.get(url_w, headers=headers)
                parsed_article = bs4.BeautifulSoup(res.text, 'lxml')
                paragraphs = parsed_article.find_all('p')
                article_text = ""
                for p in paragraphs:
                    article_text += p.text
            except Exception as e:
                article_text = ''
            news.append(article_text)
        sources = soup.select(".XTjFC g-img")
        for s in sources:
            publisher.append(s.next_sibling.lower())
        t_urls += urls
        t_news += news
        t_publisher += publisher
        t_dates += dates
        t_tickers += tickers
    df['ticker'] = t_tickers
    df['links'] = t_urls
    df['article_text'] = t_news
    df['publisher'] = t_publisher
    df['created_at'] = t_dates
    # export to csv
    today = datetime.date.today()
    d1 = today.strftime("%d%m%Y")
    df.to_csv(f'/content/drive/MyDrive/google_news_{d1}.csv')
    del news, publisher, urls, dates, tickers
    del t_news, t_publisher, t_urls, t_dates, t_tickers

import bs4
from bs4 import BeautifulSoup
import datetime

ingest_google_news()
The code above is from the following link: https://medium.com/analytics-vidhya/google-scraping-using-beautifulsoup-d53746ef5a32
Have a look at the SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser, and at a CSS selectors reference.
Code and example in the online IDE (extracts the title, link, snippet, date published, source, and stores to CSV):
from bs4 import BeautifulSoup
import requests, lxml
import pandas as pd  # needed for the DataFrame/CSV step below

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

ticker_list = ['AAPL.O', 'LVS', 'COTY.K', 'JPM', 'XOM', '005930.KS']

def get_news():
    # iterate over each ticker
    for news in ticker_list:
        params = {
            "q": news,     # query
            "hl": "en",    # language
            "gl": "us",    # country to search from
            "tbm": "nws",  # google news filter
        }

        # store news data
        news_data = []

        html = requests.get('https://www.google.com/search', headers=headers, params=params)
        soup = BeautifulSoup(html.text, 'lxml')

        # container with needed data
        for result in soup.select('.WlydOe'):
            title = result.select_one('.nDgy9d').text
            link = result['href']
            source = result.select_one('.CEMjEf span').text
            snippet = result.select_one('.GI74Re').text
            date_published = result.select_one('p.S1FAPd').text

            print(f'{title}\n{link}\n{snippet}\n{date_published}\n{source}\n')

            # append news data to list in dict() format to save it later via pandas
            news_data.append({
                'title': title,
                'link': link,
                'snippet': snippet,
                'date_published': date_published,
                'source': source
            })

        # create DataFrame from the list()
        df = pd.DataFrame(news_data)
        # save DataFrame to csv without the default index column on the left side
        df.to_csv('bs4_final.csv', index=False)

get_news()
----------
'''
EU plans to legislate for common phone charger despite ...
https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
... a person familiar with the matter said - a move likely to affect iPhone
maker Apple (AAPL.O) more than its rivals.
2 hours ago
Reuters
Wall Street ends sharply lower in broad sell-off
https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
O), Amazon.com Inc (AMZN.O), Apple Inc (AAPL.O), Facebook Inc (FB. ... O)
were among the biggest drags on the index as well as the S&P 500.
18 hours ago
Reuters
... other results
'''
Alternatively, you can achieve the same thing by using Google News Results API from SerpApi. It's a paid API with a free plan.
The main difference from your example is that you don't need to make things that complicated, figure out why things don't work as expected, and then maintain it over time. Instead, you only need to iterate over structured JSON and get what you want, fast.
Code to integrate (same process for saving to csv as with code above):
import os
from serpapi import GoogleSearch

ticker_list = ['AAPL.O', 'LVS', 'COTY.K', 'JPM', 'XOM', '005930.KS']

def get_news():
    for news in ticker_list:
        params = {
            "engine": "google",
            "q": news,
            "gl": "us",
            "tbm": "nws",
            "api_key": os.getenv("API_KEY"),  # API key from an environment variable
        }

        search = GoogleSearch(params)
        results = search.get_dict()

        for news_result in results["news_results"]:
            print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")

get_news()
-----------
'''
Title: EU plans to legislate for common phone charger despite ...
Link: https://www.reuters.com/technology/eu-plans-legislate-common-phone-charger-despite-apple-grumbles-2021-09-21/
Title: Wall Street ends sharply lower in broad sell-off
Link: https://www.reuters.com/business/wall-street-ends-sharply-lower-broad-sell-off-2021-09-20/
Title: S&P 500 down more than 2% as growth worries rise
Link: https://www.reuters.com/business/sp-500-down-more-than-2-growth-worries-rise-2021-09-20/
Title: Apple joins streaming elite, Netflix crosses milestone with ...
Link: https://www.reuters.com/technology/apple-joins-streaming-elite-netflix-crosses-milestone-with-emmy-wins-2021-09-20/
... other results
'''
Disclaimer, I work for SerpApi.

Web scraper returning zero in terminal

I'm trying to scrape this website:
https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who=bygg&bf=1&page=1
I've written a def getQuestions(tag) that fills the who={tag} part of the URL, and that works fine. When I try to add a page parameter, def getQuestions(tag, page) with page={page}, it just returns 0 in the terminal, and I really have no clue what could be causing this.
Here is the full code:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}

questionlist = []

def getQuestions(tag, page):
    url = 'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={bygg}&bf=1&page={page}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    questions = soup.find_all('div', {'class': 'box-white p-0 mb-4'})
    for item in questions:
        question = {
            'title': item.find('a', {'class': 'link-primary'}).text,
            'link': item.find('a', {'class': 'link-primary'})['href'],
            'nummer': item.find('a', {'class': 'link-body'})['href'],
            'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
            'RegÅr': item.find('div', {'class': 'col text-center'}).text,
        }
        questionlist.append(question)
    return

for x in range(1, 5):
    getQuestions('bygg', x)

print(len(questionlist))
Any help would be appreciated. Best regards!
Change the string in the url variable to an f-string:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
}

def getQuestions(tag, page):
    questionlist = []
    url = f"https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    questions = soup.find_all("div", {"class": "box-white p-0 mb-4"})
    for item in questions:
        question = {
            "title": item.find("a", {"class": "link-primary"}).text,
            "link": item.find("a", {"class": "link-primary"})["href"],
            "nummer": item.find("a", {"class": "link-body"})["href"],
            "address": item.find("address", {"class": "mt-2 mb-0"}).text,
            "RegÅr": item.find("div", {"class": "col text-center"}).text,
        }
        questionlist.append(question)
    return questionlist

out = []
for x in range(1, 5):
    out.extend(getQuestions("bygg", x))

print(len(out))
Prints:
80
Try changing your url to this:
url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
You didn't quite have your f-string set up right: without the f prefix, {tag} and {page} are left in the URL as literal text instead of being substituted.
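To illustrate the difference, a small sketch with a shortened version of the URL and made-up values:

tag, page = 'bygg', 2
plain = 'https://www.merinfo.se/search?who={tag}&page={page}'      # braces stay literal
formatted = f'https://www.merinfo.se/search?who={tag}&page={page}'  # values are substituted
print(plain)      # ...who={tag}&page={page}
print(formatted)  # ...who=bygg&page=2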

Pick only one number from an HTML page with BeautifulSoup

I have this URL with coronavirus statistics, and I would like to pick only one number: the new cases in Arizona, which is +2383 right now.
import requests
from bs4 import BeautifulSoup
import lxml
url = "https://www.worldmeter.com/coronavirus/us/"
page = requests.get("https://www.worldmeter.com/coronavirus/us/")
soup = BeautifulSoup(page.content, "lxml")
page.close()
newcases = soup.find('a', href_="https://worldmeter.com/coronavirus/arizona", class_="tableRowLinkYellow newCasesStates").get_text(strip=True)
print(newcases)
I get this error:
AttributeError: 'NoneType' object has no attribute 'get_text'
How do I pick only that number from the whole table? Thank you for your time.
As Linh said, that content is generated by JavaScript. Using Selenium is an easy way around it, but it is not efficient enough (too slow).
You could scrape the API directly instead:
import requests
url = "https://worldmeter.com/coronavirus/wp-admin/admin-ajax.php?action=wp_ajax_ninja_tables_public_action&table_id=2582&target_action=get-all-data&default_sorting=old_first"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
}

results = requests.get(url, headers=headers).json()

for result in results:
    if result["state_name"] == "Arizona":
        print(result)
        print("The newcases is", result["new_cases"])
And this gave me:
{'state_name': 'Arizona', 'positive': '275,436', 'new_cases': '2,383', 'death_in_states': '6,302', 'new_deaths': '2', 'recovered_states': '45,400', 'new_recovered': '364', 'totaltestresults': 'Arizona', 'postname': 'arizona', 'cases_100_k_population': '3,866.37', 'state_population': '7278717', 'death_100_k_population': '88.46'}
The newcases is 2,383
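If you want the figure as a plain integer rather than the formatted string, stripping the thousands separator is enough. A minimal sketch, reusing result from the loop above:

new_cases = int(result["new_cases"].replace(",", ""))  # '2,383' -> 2383
print(new_cases)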

Href attribute not showing for <a> tag when using requests.post

I am trying to download the .csv file that appears on this page when I submit the form with:
Data for : Security-wise Price volume & Deliverable position data
Symbol : 3INFOTECH
Select Series : All
Period : 24 months
My code is
symbol = "3IINFOTECH"

url = "https://www.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp"
data = {
    "dataType": "priceVolumeDeliverable",
    "symbol": symbol,
    "segmentLink": "3",
    "symbolCount": "2",
    "series": "ALL",
    "rdPeriod": "groupPeriod",
    "dateRange": "24month"
}
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}

print("fetching for " + symbol)

session = requests.session()
response = requests.post(url, data, headers=headers)
html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")

download_link = soup.findAll("span", attrs={"class": "download-data-link"})[0]
print(download_link.a["href"])
When I inspect the element in the browser, I can see the href of the download link, but the POST request from my code does not show me the href attribute.
How do I download the CSV file?
To get the link you would have to click the button, so you could use Selenium or something equivalent, but it is very easy to just parse the data yourself, because all you get back from the POST request is the data:
import csv
import requests
from bs4 import BeautifulSoup

symbol = "3IINFOTECH"

url = "https://www.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp"
data = {
    "dataType": "priceVolumeDeliverable",
    "symbol": symbol,
    "segmentLink": "3",
    "symbolCount": "2",
    "series": "ALL",
    "rdPeriod": "groupPeriod",
    "dateRange": "24month"
}
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}

print("fetching for " + symbol)

response = requests.post(url, data, headers=headers)
html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")

# the <th> cells hold the column names, every following <tr> holds one data row
cols = [th.text for th in soup.select("th")]
rows = ([td.text for td in row.select("td")] for row in soup.select("tr + tr"))

with open("data.csv", "w") as f:
    wr = csv.writer(f)
    wr.writerow(cols)
    wr.writerows(rows)
A snippet of data.csv:
Symbol,Series,Date,Prev Close,Open Price,High Price,Low Price,Last Price,Close Price,VWAP,Total Traded Quantity,Turnover in Lacs,No. of Trades,DeliverableQty,% Dly Qt toTraded Qty
3IINFOTECH,EQ,23-May-2014,9.90,10.25,10.70,9.70,10.10,10.10,10.23,"84,99,408",869.20,"16,539","40,35,648",47.48
3IINFOTECH,EQ,26-May-2014,10.10,10.40,10.60,9.10,9.30,9.20,9.97,"59,15,990",589.88,"9,894","27,10,021",45.81
3IINFOTECH,EQ,27-May-2014,9.20,9.20,9.30,8.30,8.60,8.55,8.53,"34,95,072",298.18,"3,600","14,71,141",42.09
3IINFOTECH,EQ,28-May-2014,8.55,8.60,9.40,8.45,9.30,9.15,9.07,"36,09,261",327.27,"3,955","13,92,733",38.59
3IINFOTECH,EQ,29-May-2014,9.15,9.25,9.50,8.80,9.40,9.35,9.28,"30,13,036",279.69,"3,090","15,20,654",50.47
3IINFOTECH,EQ,30-May-2014,9.35,9.35,9.55,8.90,9.00,9.00,9.13,"13,97,140",127.53,"1,992","7,43,964",53.25
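As an alternative, pandas can parse the HTML table in one step. A minimal sketch, assuming pandas (with its lxml or html5lib dependency) is installed, reusing html_content from above and assuming the price/volume table is the first table in the response:

import pandas as pd

tables = pd.read_html(html_content)        # returns a list of DataFrames, one per <table>
tables[0].to_csv("data.csv", index=False)  # write the first table to CSV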
