Help, what should I do? When I run a search, the page title is often printed with an encoding error in the console, for example: (УкÑаинÑкий VOD поÑÑаÐ)
Code

import requests
from lxml.html import fromstring
from googlesearch import search
from time import sleep as wait
import os

os.system('cls || clear')

query = input('Enter keywords: ')
list_url = []

while 1:
    try:
        col = int(input('Number of queries: '))
        break
    except ValueError:
        print('Enter a number')

for j in search(query, tld="co.in", num=col, stop=col, pause=2):
    list_url.append(j)

if list_url != []:
    for i in list_url:
        wait(0.1)
        r = requests.get(i)
        tree = fromstring(r.content)
        Title = tree.findtext('.//title')
        print(f'\r[{Title}] - {i}\n')
        try:
            os.remove('.google-cookie')
        except FileNotFoundError:
            pass
else:
    print('Empty')

input('\nExit\n')
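Titles that print as mojibake like the example above usually mean lxml parsed the raw bytes under the wrong charset. A minimal sketch of one likely fix inside the download loop: let requests detect the encoding and hand lxml decoded text instead of r.content (apparent_encoding is a heuristic guess based on the body, not a guarantee):

for i in list_url:
    wait(0.1)
    r = requests.get(i)
    r.encoding = r.apparent_encoding   # let requests guess the real charset
    tree = fromstring(r.text)          # parse decoded text, not raw bytes
    Title = tree.findtext('.//title')
    print(f'\r[{Title}] - {i}\n')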
I have found a piece of code I can use in my own script. I am not quite sure how it all works, but it does :-). As a newbie, though, I don't know how to call it. It looks like this:
"""
This script uses a simplified version of the one here:
https://snipt.net/restrada/python-selenium-workaround-for-full-page-screenshot-using-chromedriver-2x/
It contains the *crucial* correction added in the comments by Jason Coutu.
"""
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import unittest
import time
import util
from random import randint
class Test(unittest.TestCase):
""" Demonstration: Get Chrome to generate fullscreen screenshot """
def setUp(self):
self.driver = webdriver.Chrome()
def tearDown(self):
self.driver.quit()
def test_fullpage_screenshot(self):
''' Generate document-height screenshot '''
url = "https://www.(a login page)
# Login on Stockopedia
self.driver.find_element_by_id('username').send_keys('XXXXXXXXXXXXX')
self.driver.find_element_by_id('password').send_keys('XXXXXXXXXXXXX')
self.driver.find_element_by_id('auth_submit').click()
time.sleep(5)
# Indsæt tickerkode
self.driver.find_element_by_name('searchQuery').send_keys(var1, Keys.ENTER)
time.sleep(5)
self.driver.find_element_by_name('searchQuery').send_keys('', Keys.ENTER)
time.sleep(randint(10, 60))
util.fullpage_screenshot(self.driver, "test.jpg")
if __name__ == "__main__":
unittest.main(argv=[sys.argv[0]])
Can anybody help me, so I can call it from another script with the variable var1 as an argument?
I have now added the script that calls the class; the call is between the two rows of stars (*).
But as I see it, not even an instance of the class is created. What am I doing wrong?
import bs4 as bs
import datetime as dt
import os
import logging
import pandas as pd
from pandas_datareader import data as pdr
import pickle
import requests
import re
import test
import fix_yahoo_finance as yf
import time

yf.pdr_override()

# ticker_index contains: the URL of the website the ticker codes should be
# fetched from, the class name of the table the ticker codes are in, the
# number of the table column the ticker codes are in, and the name of the
# file the program should store the ticker codes in
# ticker_indexes = [['dk_large_cap', 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=SE0001776667',
#                    'tablesorter tablesorter-default', 1, 'dk_large_cap_tickers.pickle']]
ticker_indexes = [['c25', 'https://en.wikipedia.org/wiki/OMX_Copenhagen_25',
                   'wikitable sortable', 2, 'c25_tickers.pickle'],
                  ['dax', 'https://en.wikipedia.org/wiki/DAX',
                   'wikitable sortable', 3, 'dax_tickers.pickle'],
                  ['sto30', 'https://da.wikipedia.org/wiki/OMXS30',
                   'wikitable sortable', 2, 'sto30_tickers.pickle'],
                  ['obx25', 'https://en.wikipedia.org/wiki/OBX_Index',
                   'wikitable sortable', 2, 'obx25_tickers.pickle'],
                  ['nasdaq100', 'https://www.cnbc.com/nasdaq-100/',
                   'data quoteTable', 0, 'nasdaq100.pickle']]

logging.basicConfig(filename='Share prices logfile.log', level=logging.INFO,
                    format='%(asctime)s: %(levelname)s: %(message)s')


def save_index_tickers(indexname, tickerpath, table_class_id, tickercol,
                       tickerlist):
    try:
        resp = requests.get(tickerpath)
        soup = bs.BeautifulSoup(resp.text, 'lxml')
        table = soup.find('table', {'class': table_class_id})
        tickers = []
        for row in table.findAll('tr')[1:]:
            ticker = row.findAll('td')[tickercol].text.replace('.', '-')
            ticker = ticker.strip('\n')
            if (indexname == 'sto30') or (indexname == 'obx25'):
                ticker = ticker[1:]
            tickers.append(ticker)
            print(ticker)
        with open('C:\\Users\\Johnn\\Desktop\\FA Sheet\\pickle\\' + tickerlist, "wb") as f:
            pickle.dump(tickers, f)
        logging.info(str(indexname) + ' ' + str(tickerlist) + ' OK')
        return tickers
    except Exception as e:
        logging.warning(str(indexname) + str(tickerlist) + str(e))


# save screendump
def get_scrdump_from_stop(indexname, tickerpath, table_class_id, tickercol,
                          tickerlist, reload=False):
    try:
        if reload:
            logging.info('RELOAD ' + str(indexname) + str(tickerlist))
            tickers = save_index_tickers(indexname, tickerpath, table_class_id,
                                         tickercol, tickerlist)
        else:
            with open('C:\\Users\\Johnn\\Desktop\\FA Sheet\\pickle\\' + tickerlist, "rb") as f:
                tickers = pickle.load(f)
        if not os.path.exists('C:\\Users\\Johnn\\Desktop\\FA Sheet\\Sheet'):
            os.makedirs('C:\\Users\\Johnn\\Desktop\\FA Sheet\\Sheet')
        # ******************************************************************************
        for ticker in tickers:
            obj = test.Test(var1)
            obj.setUp()
            obj.test_fullpage_screenshot()
            obj.tearDown()
        # ******************************************************************************
        logging.info(str(indexname) + ' Sheet downloaded OK')
    except Exception as e:
        logging.warning(str(indexname) + str(tickerlist) + str(e))


def main(ticker_indexes):
    for ticker_index in ticker_indexes:
        print('*****')
        print(ticker_index[0])
        print('*****')
        save_index_tickers(ticker_index[0], ticker_index[1], ticker_index[2],
                           ticker_index[3], ticker_index[4])
        get_scrdump_from_stop(ticker_index[0], ticker_index[1], ticker_index[2],
                              ticker_index[3], ticker_index[4])
    logging.info('Finished')


main(ticker_indexes)
Import your file as usual.
If the class you are importing is in another directory:
import dir1.dir2.filename
If you are calling the class from a file in the same directory:
import filename
To create an object of your file's class:
obj = filename.Test(var1)
Then the rest of the code will look like this:
obj.setUp()
obj.test_fullpage_screenshot()
obj.tearDown()
If you're wondering what self means in the code: to execute a method of the class, the first argument has to be the class instance itself, self. This is why you can't call the method like this, without instantiating an object:
filename.Test.setUp()
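One caveat specific to this code: Test subclasses unittest.TestCase, and the TestCase constructor expects the name of a test method, not your own data, so test.Test(var1) fails instead of creating an instance. A sketch of one workaround, assuming you also change the test to read the ticker from self.var1 (using var1 as a class attribute is my assumption, not part of the original):

import test

for ticker in tickers:
    test.Test.var1 = ticker                      # hypothetical attribute; read it as self.var1 inside the test
    obj = test.Test('test_fullpage_screenshot')  # TestCase wants a test-method name, not data
    obj.setUp()
    obj.test_fullpage_screenshot()
    obj.tearDown()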
I am trying to get the user statuses from Weibo, but I keep getting this error.
import re
import string
import sys
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
import requests
from lxml import etree

reload(sys)
sys.setdefaultencoding('utf-8')

if len(sys.argv) >= 2:
    user_id = int(sys.argv[1])
else:
    user_id = int(raw_input("input user_id: "))
cookie = {"Cookie": "******my cookies"}
url = 'http://weibo.cn/u/%d?filter=1&page=1' % user_id

html = requests.get(url, cookies=cookie).content
selector = etree.HTML(html)
pageNum = int(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
result = ""
urllist_set = set()
word_count = 1
image_count = 1
print 'spider is ready...'

for page in range(1, pageNum + 1):
    url = 'http://weibo.cn/u/%d?filter=1&page=%d' % (user_id, page)
    lxml = requests.get(url, cookies=cookie).content
    selector = etree.HTML(lxml)
    content = selector.xpath('//span[@class="ctt"]')
    for each in content:
        text = each.xpath('string(.)')
        if word_count >= 4:
            text = "%d :" % (word_count - 3) + text + "\n\n"
        else:
            text = text + "\n\n"
        result = result + text
        word_count += 1

fo = open("/Users/apple/Desktop/%s" % user_id, "wb")
fo.write(result)
word_path = os.getcwd() + '/%d' % user_id
print 'done'
Error:
File "weibo_spider.py", line 25, in <module>
pageNum = (int)(selector.xpath('//input[#name="mp"]')[0].attrib['value'])
IndexError: list index out of range
You are assuming selector.xpath will always find something, but this isn't true in most cases. So build the habit of defensive programming. See Defensive Programming.
Try replacing
pageNum = int(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
with:
controls = selector.xpath('//input[@name="mp"]')
if controls:
    pageNum = int(controls[0].attrib['value'])
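Note that this still leaves pageNum undefined when controls is empty, so the very next use of it would raise a NameError. One way to make the failure explicit (the exit message is my own choice; the page-count input is typically absent when the request wasn't authenticated, so the cookie is worth checking first):

controls = selector.xpath('//input[@name="mp"]')
if controls:
    pageNum = int(controls[0].attrib['value'])
else:
    # the "mp" input is usually missing when the cookie is invalid or expired
    sys.exit('could not find the "mp" input; check that the cookie is still valid')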
I'm getting this error:
NameError: name 'htmltext' is not defined
It comes from the code below:
from bs4 import BeautifulSoup
import urllib
import urllib.parse

url = "http://nytimes.com"
urls = [url]
visited = [url]

while len(urls) > 0:
    try:
        htmltext = urllib.urlopen(urls[0]).read()
    except:
        print(urls[0])
    soup = BeautifulSoup(htmltext)
    urls.pop(0)
    print(soup.findAll('a', href=true))
In Python 3.x, you have to import urllib.request instead of urllib. Then, change the line:
htmltext = urllib.urlopen(urls[0]).read()
to:
htmltext = urllib.request.urlopen(urls[0]).read()
Finally, change true to True.
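Putting those changes together, plus one more that the NameError itself points at (when urlopen fails, the except branch must skip that URL, otherwise htmltext is used without ever being assigned), the loop could look something like this sketch ('html.parser' is passed only to make BeautifulSoup's parser choice explicit; the skip-on-error behavior is my assumption about the intent):

from bs4 import BeautifulSoup
import urllib.request

url = "http://nytimes.com"
urls = [url]
visited = [url]

while len(urls) > 0:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
    except Exception:
        print(urls[0])
        urls.pop(0)
        continue  # skip this URL; htmltext was never assigned
    urls.pop(0)
    soup = BeautifulSoup(htmltext, 'html.parser')
    print(soup.findAll('a', href=True))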
I'm trying to find and print all .com URLs from a large text file using regex. As there are approximately 40 different URLs, I'm wondering if there is a way to search for them without doing it one by one.
The code I used gets xxxx.com but is missing the https://www. at the beginning. Can anyone tell me how I get the full result? Thank you in advance!
import re

url = len(".com")
re = re.compile(r'\w*.com\b', url)
for line in open("report.txt"):
    for url in re.findall(line):
        print url
This seems to work:
#!/usr/local/cpython-2.7/bin/python

import re


def main():
    regex = re.compile(r'https?://[^ \t]*\.com\b', re.MULTILINE | re.DOTALL)
    with open('logs.txt', 'r') as file_:
        text = file_.read()
        for url in regex.findall(text):
            print(url)

main()
HTH
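If the ~40 URLs belong to a known, fixed list of domains, a hedged alternative is to build one pattern out of that list instead of matching generic .com links (the domain names below are made-up placeholders):

import re

# hypothetical list; put your ~40 domains here
domains = ['example.com', 'foo-bar.com', 'another-site.com']
pattern = re.compile(r'https?://(?:www\.)?(?:' + '|'.join(map(re.escape, domains)) + r')\S*')

with open('report.txt') as file_:
    for url in pattern.findall(file_.read()):
        print(url)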
#!/usr/bin/python

import urllib
import urlparse
import re
import requests


#
# A class for dealing with links
#
class linkGrabber:
    linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')

    #
    # Remove whitespace and hash tags
    #
    def clean(self, link):
        link = re.sub(' ', '', link)
        link = re.sub("#", '', link)
        return link

    def depth(self, link):
        return len(urlparse.urlparse(link).path.split("/")) - 1

    def isAbsolute(self, link):
        return len(urlparse.urlparse(link).netloc) > 0

    def isRelative(self, link):
        return len(urlparse.urlparse(link).netloc) < 1

    def grab(self, markup, *args):
        links = self.linkregex.findall(markup)
        relative = []
        absolute = []
        for this in links:
            if self.isAbsolute(this):
                absolute.append(this)
            else:
                relative.append(this)
        if len(args) <= 0:
            return relative + absolute
        elif "abs" in args:
            return absolute
        else:
            return relative
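A minimal usage sketch, assuming the class lives in the current module (the markup string is made up for illustration):

grabber = linkGrabber()
markup = '<a href="https://example.com/docs">docs</a> <a href="/about">about</a>'

print(grabber.grab(markup))         # relative first, then absolute: ['/about', 'https://example.com/docs']
print(grabber.grab(markup, "abs"))  # absolute links only: ['https://example.com/docs']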