This is my code for collecting restaurant reviews.
The page range is hard-coded, but each store has a different number of reviews, so the scraper should move on to the next store sooner when a store only has a few reviews.
The range is too large, but I cannot simply reduce it, because some shops really do have reviews across the whole range.
How can I do this efficiently?
While searching I came across find_all(element), but I am not sure whether I have applied it to my code correctly.
#python3
import sys
from bs4 import BeautifulSoup
import urllib.request
import requests
from urllib.parse import quote
import time
import os
import xlwt
import random
import re

FISRT_URL = "https://www.yelp.com/search?find_desc=Korean+Food&find_loc=Seattle,+WA&start="
LAST_URL = "&cflt=korean"

def get_link(URL, doc_name):
    global jisu_i
    global num
    global page
    for jisu_i in range(1):
        current_page_num = 20 + jisu_i*10
        position = URL.index('t=')
        URL_with_page_num = URL[: position+2] + str(current_page_num) \
                            + URL[position+2 :]
        print(URL_with_page_num)
        importurl = URL_with_page_num
        r = requests.get(importurl)
        soup = BeautifulSoup(r.content.decode('euc-kr', 'replace'), "lxml")
        time.sleep(random.randint(10, 15))
        for title in soup.find_all('h3')[page+2:21]:
            page = page + 1
            title_link = title.select('a')
            for jisu_m in range(130):
                print(page)
                last_URL = title_link[0]['href']
                print(last_URL)
                first_URL = "https://www.yelp.com"
                global article_URL
                article_URL = first_URL + last_URL
                time.sleep(random.randint(15, 30))
                jisuurl = article_URL
                for k in range(99):
                    jisu_page_num = 0 + k * 20
                    position = jisuurl.index('?')
                    URL_with_page_num = jisuurl[: position + 1] + "start=" + str(jisu_page_num)
                    jisu_with_page_num = URL_with_page_num
                    print(jisu_with_page_num)
                    jisu_importurl = jisu_with_page_num
                    get_text(URL, jisu_importurl, doc_name)
                    time.sleep(random.randint(40, 180))
Yelp has a very well-documented API here: https://www.yelp.com/developers/documentation/v3
This is the only reliable way of interacting with the site programmatically.
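For example, a minimal sketch of an API search with requests (the API key is a placeholder you obtain from the Yelp developer portal; the parameters mirror the original search URL):

import requests

API_KEY = "YOUR_YELP_API_KEY"  # placeholder: issued by the Yelp developer portal
headers = {"Authorization": "Bearer " + API_KEY}
params = {"term": "Korean Food", "location": "Seattle, WA", "categories": "korean"}

resp = requests.get("https://api.yelp.com/v3/businesses/search",
                    headers=headers, params=params)
resp.raise_for_status()
for business in resp.json()["businesses"]:
    # review_count tells you up front how many reviews each store has,
    # so you can stop paging early for stores with few reviews
    print(business["name"], business["review_count"])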
I have the code below, which takes a value from the pluang.com site, but I want to use two colors so that I don't get confused and it's easy to read.
If the scraped value is below 860,000 (e.g. 850,000) the print should be red, and if it is above 860,000 (e.g. 870,000) the print should be green.
Can anyone help? Thank you.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from datetime import datetime
import time
import os

class bcolors:
    SELL = '\033[92m'
    BUY = '\033[31;1m'

os.system("clear")
while True:
    url = "https://pluang.com/produk/pluang-emas"
    UserAgent = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(UserAgent)
    data = BeautifulSoup(html, "html.parser")
    items = data.findAll("h1", {"id": "gold-price"})
    for item in items:
        print(bcolors.BUY + " SELL GOLD:", item.get_text().strip(), "-", datetime.now())
    time.sleep(59)
You already have the class to color your terminal.
You now need to take the value (which you are already displaying), check whether it is bigger or smaller than your threshold, and print accordingly.
In your code, item.get_text() returns something like Rp860.530/g. Here Rp is the currency and g stands for gram. Let's get rid of them:
import re
numbers = list(map(float, re.findall(r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", item.get_text())))
see: https://stackoverflow.com/a/4289348/2681662
This will return all numbers in your text as a list.
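For example, on a sample string (the value here is illustrative):

import re

print(list(map(float, re.findall(r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", "Rp860.530/g"))))
# [860.53]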
Now we can check whether the value is smaller or bigger than our threshold.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from datetime import datetime
import time
import os
import re

class bcolors:
    BUY = '\033[92m'
    SELL = '\033[31;1m'

os.system("clear")
while True:
    url = "https://pluang.com/produk/pluang-emas"
    UserAgent = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(UserAgent)
    data = BeautifulSoup(html, "html.parser")
    items = data.findAll("h1", {"id": "gold-price"})
    for item in items:
        # e.g. "Rp860.530/g" -> [860.53]
        rr = list(map(float, re.findall(r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", item.get_text())))
        if rr[0] < 860:
            print(bcolors.SELL + " SELL GOLD:", item.get_text().strip(), "-", datetime.now())
        else:
            print(bcolors.BUY + " BUY GOLD:", item.get_text().strip(), "-", datetime.now())
    time.sleep(59)
I'm not sure whether I have the red and green parts the right way round; you can swap them if needed.
I think this is what you're after. Your question implied that you were asking how to print colors; you would have had more focused answers if you had said you needed to know how to extract the price.
There's only one price returned here, so the inner loop seems unnecessary.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from datetime import datetime
import time
import os

class bcolors:
    SELL = '\033[92m'
    BUY = '\033[31;1m'
    WHITE = '\033[37m'

url = "https://pluang.com/produk/pluang-emas"
while True:
    UserAgent = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(UserAgent)
    data = BeautifulSoup(html, "html.parser")
    item = data.findAll("h1", {"id": "gold-price"})[0].get_text()
    # "Rp860.530/g" -> 860530
    price = int(item.strip('Rp/g').replace('.', ''))
    if price > 860000:
        print(bcolors.SELL + " SELL GOLD:", item, "-", datetime.now())
    else:
        print(bcolors.BUY + " BUY GOLD:", item, "-", datetime.now())
    print(bcolors.WHITE, end='')
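If it helps, here is the parsing step in isolation (the sample string is illustrative): strip('Rp/g') removes any of those characters from both ends of the string, and replace('.', '') drops the thousands separators.

item = "Rp860.530/g"  # illustrative sample
price = int(item.strip('Rp/g').replace('.', ''))
print(price)  # 860530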
Python code:
import string
import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.basketball-reference.com/players/'
initial = list(string.ascii_lowercase)
initial_url = [url + i for i in initial]
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
tags_initial = [i('a') for i in soup_initial]
print(tags_initial[0][50])
Results example:
Shareef Abdur-Rahim
From the example above, I want to extract the player's name, which is 'Shareef Abdur-Rahim', but I want to do it for all of the tags_initial lists.
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe this could help you:
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://www.basketball-reference.com/players/'
# Alphabet
initial = list(string.ascii_lowercase)
datas = []
# URLs
urls = [url + i for i in initial]
for url in urls:
    # Soup object
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    # Player links
    url_links = soup.findAll("a", href=re.compile("players"))
    for link in url_links:
        # Player name
        datas.append(link.string)
print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
index = html.find("a href")
index = html.find(">", index) + 1
index_end = html.find("<", index)
print(html[index:index_end])
If you're using a scraper library it probably has a similar function built-in.
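For comparison, a minimal BeautifulSoup equivalent on the same sample string:

from bs4 import BeautifulSoup

html = '<a href="/teams/LAL/2021.html">Los Angeles Lakers</a>'
print(BeautifulSoup(html, "html.parser").a.get_text())  # Los Angeles Lakers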
I have built a list which contains hrefs from a website and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it only selects random characters from the individual href strings rather than a random link from the list.
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
We add each article to the articles array.
The random choice is now done after the loop, rather than inside the loop.
I'm fairly new to this and I've been working on this scrape of a webpage for a number of days now. I have been actively trying to avoid asking the question, but I'm seriously stuck.
My Problems
Where the span loop is currently positioned, it prints all the prices for every listing each time it runs through the "for product" loop. If I place it outside of this loop, it prints either only the first or only the last price in the list. How do I extract the price and print it alongside each individual product?
I understand I have a lot of unused imports listed. These were just various avenues I was trying, and I have yet to remove them.
The end goal here is to push this to either a JSON or a CSV file (also not written yet, but I have a fair idea how to approach that aspect once I have the data).
from bs4 import BeautifulSoup
import requests
import shutil
import csv
import pandas
from pandas import DataFrame
import re
import os
import urllib
import locale
import json
from selenium import webdriver

os.environ["PYTHONIOENCODING"] = "utf-8"
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
browser.get("https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html")

URL = 'https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
products = soup.find_all("div", "GC62 Product")
for product in products:
    #title
    title = product.find("h3")
    titleText = title.text if title else ''
    #manufacturer name
    manufacturer = product.find("div", "GC5 ProductManufacturer")
    manuText = manufacturer.text if manufacturer else ''
    #image location
    img = product.find("div", "ProductImage")
    imglinks = img.find("a") if img else ''
    imglinkhref = imglinks.get('href') if imglinks else ''
    imgurl = 'https://www.mcavoyguns.co.uk/contents'+imglinkhref
    #print(imgurl.replace('..', ''))
    #description
    description = product.find("div", "GC12 ProductDescription")
    descText = description.text if description else ''
    #more description
    more = product.find("div", "GC12 ProductDetailedDescription")
    moreText = more.text if more else ''
    #price - not right
    spans = browser.find_elements_by_css_selector("div.GC20.ProductPrice span")
    for i in range(0, len(spans), 2):
        span = spans[i].text
        print(span)
        i += 1
    print(titleText)
    print(manuText)
    print(descText)
    print(moreText)
    print(imgurl.replace('..', ''))
    print("\n")
output:
£1,695.00
£1,885.00
£1,885.00
£2,015.00
£2,175.00
£2,175.00
£2,385.00
£2,115.00
£3,025.00
£3,315.00
£3,635.00
£3,925.00
£2,765.00
£3,045.00
£3,325.00
£3,615.00
£3,455.00
£2,815.00
£3,300.00
£3,000.00
£7,225.00
£7,555.00
£7,635.00
£7,015.00
£7,355.00
12G Beretta DT11 Trap Adjustable
beretta
Click on more for full details.
You may order this gun online, however due to UK Legislation we are not permitted to ship directly to you, we can however ship to a registered firearms dealer local to you. Once we receive your order, we will contact you to arrange which registered firearms dealer you would like the gun to be shipped to.
DT 11 Trap (Steelium Pro)
12
2 3/4"
10x10 rib
3/4&F
30"/32" weight; 4k
https://www.mcavoyguns.co.uk/contents/media/l_dt11_02.jpg
Comments from @baduker pointed me in the correct direction: indentation. Thanks!
from bs4 import BeautifulSoup
import requests
import shutil
import csv
import pandas
from pandas import DataFrame
import re
import os
import urllib
import locale
import json
from selenium import webdriver

os.environ["PYTHONIOENCODING"] = "utf-8"
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
browser.get("https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html")

URL = 'https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
products = soup.find_all("div", "GC62 Product")
for product in products:
    #title
    title = product.find("h3")
    titleText = title.text if title else ''
    #manufacturer name
    manufacturer = product.find("div", "GC5 ProductManufacturer")
    manuText = manufacturer.text if manufacturer else ''
    #image location
    img = product.find("div", "ProductImage")
    imglinks = img.find("a") if img else ''
    imglinkhref = imglinks.get('href') if imglinks else ''
    imgurl = 'https://www.mcavoyguns.co.uk/contents'+imglinkhref
    #print(imgurl.replace('..', ''))
    #description
    description = product.find("div", "GC12 ProductDescription")
    descText = description.text if description else ''
    #more description
    more = product.find("div", "GC12 ProductDetailedDescription")
    moreText = more.text if more else ''
    #price - not right
    spans = browser.find_elements_by_css_selector("div.GC20.ProductPrice span")
    for i in range(0, len(spans), 2):
        span = spans[i].text
        print(span)
        i += 1
    print(titleText)
    print(manuText)
    print(descText)
    print(moreText)
    print(imgurl.replace('..', ''))
    print("\n")
I am trying to get the user statuses from Weibo, but I keep having this error.
import re
import string
import sys
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
import requests
from lxml import etree

reload(sys)
sys.setdefaultencoding('utf-8')

if(len(sys.argv)>=2):
    user_id = (int)(sys.argv[1])
else:
    user_id = (int)(raw_input("input user_id: "))

cookie = {"Cookie": "******my cookies"}
url = 'http://weibo.cn/u/%d?filter=1&page=1'%user_id
html = requests.get(url, cookies = cookie).content
selector = etree.HTML(html)
pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
result = ""
urllist_set = set()
word_count = 1
image_count = 1
print 'spider is ready...'

for page in range(1,pageNum+1):
    url = 'http://weibo.cn/u/%d?filter=1&page=%d'%(user_id,page)
    lxml = requests.get(url, cookies = cookie).content
    selector = etree.HTML(lxml)
    content = selector.xpath('//span[@class="ctt"]')
    for each in content:
        text = each.xpath('string(.)')
        if word_count>=4:
            text = "%d :"%(word_count-3) +text+"\n\n"
        else :
            text = text+"\n\n"
        result = result + text
        word_count += 1

fo = open("/Users/apple/Desktop/%s"%user_id, "wb")
fo.write(result)
word_path=os.getcwd()+'/%d'%user_id
print 'done'
Error:
File "weibo_spider.py", line 25, in <module>
    pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
IndexError: list index out of range
You are assuming selector.xpath will always find something, but this isn't true in most cases. So build the habit of defensive programming. See Defensive Programming
Try replacing
pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
With:
controls = selector.xpath('//input[@name="mp"]')
if controls:
    pageNum = int(controls[0].attrib['value'])
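If you want the script to continue when the control is missing, you can add an explicit fallback (defaulting to 1 page below is an assumption; an empty result usually means the cookie is invalid or the page layout changed):

controls = selector.xpath('//input[@name="mp"]')
if controls:
    pageNum = int(controls[0].attrib['value'])
else:
    pageNum = 1  # assumed fallback: control not found, e.g. bad cookie or changed layout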