Python accepts a hard-coded list variable but not a list read from a file - python

I am attempting to implement selenium and list.
This is my code:
from selenium import webdriver
pool_size = 1
def worker(item1):
    """Process one work item; report (rather than raise) any failure."""
    try:
        # driver = webdriver.Chrome()
        print("Processing:", item1)
        # driver.get(item1)
    except Exception as exc:
        # Best-effort: log the error so the pool keeps running.
        print('error | ' + str(exc))
# NOTE(review): Pool was never imported in the original snippet.
from multiprocessing import Pool

pool = Pool(pool_size)
with open('file.txt', 'r') as fh:
    items = fh.read().splitlines()
for item in items:
    print(item)
    # BUG FIX: apply_async's second argument is an ITERABLE of positional
    # args.  Passing str(item) iterates the string, so every character
    # became a separate argument; any line longer than one character
    # called worker() with the wrong arity and apply_async swallowed the
    # TypeError silently.  That is why ['1', '2'] "worked" but real file
    # lines produced nothing.  Wrap the item in a 1-tuple instead.
    pool.apply_async(worker, (item,))
pool.close()
pool.join()
If I change items = open('file.txt', 'r').read().splitlines() to items = ['1','2'] it works; otherwise it just returns nothing and the script ends. The same thing happens with the file if I input 1\n2.
No idea why this is happening, super weird. Thanks in advance.

Related

web scraping with python selenium loop and save problem

'Hi, I want to save the scraped data as csv and txt files, but I couldn't.
Moreover, how can I repeat this process multiple times?'
nextInput = driver.find_element("xpath",'//*[#id="pnnext"]/span[2]').click()
result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
'
Code;
'
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import pandas as pd
import time
import csv
import re

# Scrape Google result URLs page by page and save them to siteler.txt.
driver = webdriver.Chrome()
url = "http://google.com"
driver.get(url)
# Brittle absolute XPath kept from the original snippet.
searchInput = driver.find_element("xpath", '/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
time.sleep(1)
searchInput.send_keys("dişçi")
time.sleep(2)
searchInput.send_keys(Keys.ENTER)
time.sleep(2)

# Accumulate result URLs across ALL pages.
# BUG FIX: the original did `result = []` immediately after printing the
# first page and then `result = list(set(result))`, which threw the first
# page's results away before anything was saved.
all_results = []

def scrape_current_page():
    """Append the visible result URLs of the current page to all_results."""
    elements = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
    for index, element in enumerate(elements):
        print(index + 1, element.text)
        all_results.append(element.text)

scrape_current_page()
try:
    while 1:
        # BUG FIX: the pasted XPath read [#id="pnnext"]; XPath's attribute
        # axis is '@', so the selector never matched anything.
        driver.find_element("xpath", '//*[@id="pnnext"]/span[2]').click()
        time.sleep(2)
        scrape_current_page()
except Exception as e:
    print(e)
finally:
    print("there is no element with '//*[#id='pnnext']/span[2]' XPATH")
    # BUG FIX: write the file ONCE at the end.  The original reopened it
    # in "w" (truncate) mode on every page, so only the last page
    # survived, and it wrote the WebElement repr instead of its text.
    # BUG FIX: the original also called driver.close() BEFORE the paging
    # loop, so every later driver call failed immediately.
    with open("siteler.txt", "w", encoding="UTF-8") as file:
        for count, item in enumerate(dict.fromkeys(all_results), start=1):
            file.write(f"{count}-{item}\n")
    driver.close()

Accelerate 2 loops with regex to find email address on website

I need help finding email addresses on websites. After some research I found a solution, but it is very slow: I have a lot of data (more than 90,000 URLs) and my code never finishes.
Do you know any tips to optimize/accelerate my code?
This is my list of the URL:
http://etsgaidonsarl.site-solocal.com/
http://fr-fr.facebook.com/people/
http://ipm-mondia.com/
http://lfgenieclimatique.fr/
http://vpcinstallation.site-solocal.com
http://www.cavifroid.fr/
http://www.clim-monnier.com/
http://www.climacool.net/
I use 2 loops. The first finds all the pages of a website, because the email address is not always on the first page.
In the second loop, I scan each page to find the email address. The code:
# RFC 5322-style email pattern: quoted/dot-atom local part, then a domain
# name or bracketed IP literal.
# NOTE(review): the separator between local part and domain appears here as
# '#', where an email regex needs '@' -- this looks garbled in transit
# (the same '@'->'#' mangling affects the XPaths above); confirm against the
# original source before relying on matches.
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
I think my regex is too long, it can be a problem ?
# For each URL, collect its absolute links, then scan every linked page
# for email addresses.  Rows are [index_of_origin_url, email, page_url].
session = HTMLSession()
mailing = []
for index, i in enumerate(link):  # link is the list of the URLs
    try:
        r = session.get(i)
        linkslist = list(r.html.absolute_links)
    except Exception:
        # BUG FIX: the original fallback was `linkslist = list(i)`, which
        # splits the URL STRING into single characters, issuing one bogus
        # request per character.  Fall back to scanning the URL itself.
        linkslist = [i]
    for j in linkslist:
        try:
            r1 = session.get(j)
            for re_match in re.finditer(EMAIL_REGEX, r1.html.raw_html.decode()):
                mailing.append([index, re_match.group(), j])
        except Exception:
            # Best-effort: a page that fails to fetch/decode is skipped.
            pass
print(mailing)
df = pd.DataFrame(mailing, columns=['index1','mail','lien',])
Thanks for your help.
I think multi-threading should do the job. As for your regex, I don't know exactly what it matches, but assuming it works as intended, the multi-threaded version should look like the following. I tested the code, and it works.
from threading import Thread, Lock
from requests_html import HTMLSession
import re

lock = Lock()

# URLs whose pages (and linked pages) are scanned for email addresses.
link = ["http://etsgaidonsarl.site-solocal.com/",
        "http://fr-fr.facebook.com/people/",
        "http://ipm-mondia.com/",
        "http://lfgenieclimatique.fr/",
        "http://vpcinstallation.site-solocal.com",
        "http://www.cavifroid.fr/",
        "http://www.clim-monnier.com/",
        "http://www.climacool.net/"]

linklist = []       # (origin_url, discovered_link) pairs produced by links_scraper
mailing = []        # result rows: [index_of_origin, email, page_url]
main_threads = []   # one link-scraper thread per entry in `link`
minor_threads = []  # one mail-scraper thread per discovered page

EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""


def links_scraper(single_url):
    """Collect every absolute link found on single_url into linklist."""
    try:
        session = HTMLSession()
        r = session.get(single_url)
        the_list = list(r.html.absolute_links)
        linklist.extend(list(zip([single_url for _ in range(len(the_list))], the_list)))
    except Exception:
        # On fetch failure, fall back to scanning the page itself.
        linklist.append((single_url, single_url))


def mail_scrapper(main_url, single_link):
    """Scan one page for email addresses; append hits to mailing."""
    try:
        session = HTMLSession()
        r1 = session.get(single_link)
        for re_match in re.finditer(EMAIL_REGEX, r1.html.raw_html.decode()):
            mailing.append([link.index(main_url), re_match.group(), single_link])
    except Exception:
        pass


def main():
    """Fan out one links_scraper per URL, then one mail_scrapper per page."""
    for l in link:
        t = Thread(target=links_scraper, args=(l,))
        t.start()
        main_threads.append(t)
    # Drain linklist while the link scrapers are still running.
    while len(main_threads) > 0:
        try:
            with lock:
                current_link = linklist.pop(0)
            minor_thread = Thread(target=mail_scrapper, args=(current_link[0], current_link[1]))
            minor_threads.append(minor_thread)
            minor_thread.start()
        except IndexError:
            pass
        # BUG FIX: Thread.isAlive() was removed in Python 3.9; is_alive()
        # is the supported spelling.  Also iterate over a COPY, since the
        # original popped from main_threads while iterating it.
        for t in list(main_threads):
            if not t.is_alive():
                main_threads.pop(main_threads.index(t))
    for t in minor_threads:
        t.join()


main()
print("Mailing:", mailing)

Selenium Python webscraper really slow

I'm a newbie getting into web scrapers. I've made something that works, but it takes hours and hours to get everything I need. I read something about using parallel processes to process the URLs but I have no clue how to go about it and incorporate it in what I already have. Help is much appreciated!
Here is my, still extremely messy, code. I'm still learning :)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time
import random
import pprint
import itertools
import csv
import pandas as pd

# Collect every job-posting URL from the paginated search results.
start_url = "https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO"
driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
driver.get(start_url)
# BUG FIX: find_element_by_xpath() was removed in Selenium 4; use
# find_element(By.XPATH, ...).  The pasted '#id' is a garbled '@id'
# (XPath attribute axis) -- the original selector could never match.
driver.find_element(By.XPATH, '//*[@id="form_save"]').click()  # accepts cookies
wait = WebDriverWait(driver, random.randint(1500, 3200) / 1000.0)
time.sleep(random.randint(1500, 3200) / 1000.0)
num_jobs = int(driver.find_element(By.XPATH, '/html/body/div[3]/div/main/div[2]/div[3]/div/header/h2/span').text)
# NOTE(review): page size looks like 100 (the URL's limit=100) but the
# divisor is 102 -- confirm the intended page size.
num_pages = int(num_jobs / 102)
urls = []
list_of_links = []
for _ in range(num_pages + 1):  # BUG FIX: the inner loop shadowed this index
    try:
        elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="search-results-container"]//article/job/a')))
        for element in elements:
            list_of_links.append(element.get_attribute('href'))
        time.sleep(random.randint(1500, 3200) / 1000.0)
        # The "next" button sits at a different position once past page 3.
        if 'page=3' not in driver.current_url:
            driver.find_element(By.XPATH, '//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[6]/a').click()
        else:
            driver.find_element(By.XPATH, '//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[5]/a').click()
        url = driver.current_url
        if url not in urls:
            print(url)
            urls.append(url)
        else:
            # Landing on an already-seen page means pagination wrapped.
            break
    except Exception:
        continue
set_list_of_links = list(set(list_of_links))
print(len(set_list_of_links), "results")
driver.close()
def grouper(n, iterable):
    """Yield successive n-sized tuples from *iterable*; the last may be shorter."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, n))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, n))
def remove_empty_lists(l):
    """Repeatedly strip empty sub-lists until the structure stops changing.

    A single remover() pass can expose NEW empty lists (e.g. [[[]]] -> [[]]),
    so iterate to a fixed point.
    """
    current = l
    while True:
        cleaned = remover(current)
        if cleaned == current:
            return cleaned
        current = cleaned
def remover(l):
    """Return a copy of *l* with empty sub-lists dropped, recursing into lists.

    One call removes one "layer" of emptiness; [[[]]] becomes [[]], not [].
    """
    cleaned = []
    for element in l:
        if isinstance(element, list):
            if element:
                cleaned.append(remover(element))
        else:
            cleaned.append(element)
    return cleaned
# Visit every collected job URL and extract its text into `vacatures`
# (one [url, dl-texts..., body-text] row per posting).
vacatures = []
chunks = grouper(100, set_list_of_links)
chunk_count = 0
for chunk in chunks:
    chunk_count += 1
    print(chunk_count)
    time.sleep(random.randint(1500, 3200) / 1000.0)
    for url in chunk:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        try:
            driver.get(url)
            # BUG FIX: find_element_by_* was removed in Selenium 4, and the
            # pasted '#id' is a garbled '@id' (XPath attribute axis).
            driver.find_element(By.XPATH, '//*[@id="form_save"]').click()  # accepts cookies
            vacature = [url]
            time.sleep(random.randint(1500, 3200) / 1000.0)
            elements = driver.find_elements(By.TAG_NAME, 'dl')
            p_elements = driver.find_elements(By.TAG_NAME, 'p')
            li_elements = driver.find_elements(By.TAG_NAME, 'li')
            for dl in elements:
                if "Salaris:" not in dl.text:
                    vacature.append(dl.text)
            running_text = [p.text for p in p_elements]
            text = [''.join(running_text)]
            # Boilerplate navigation/footer items to drop from <li> text.
            remove_ls = ['vacatures', 'carrièretips', 'help', 'inloggen', 'inschrijven', 'Bezoek website', 'YouTube',
                         'Over Nationale Vacaturebank', 'Werken bij de Persgroep', 'Persberichten', 'Autotrack', 'Tweakers',
                         'Tweakers Elect', 'ITBanen', 'Contact', 'Carrière Mentors', 'Veelgestelde vragen',
                         'Vacatures, stages en bijbanen', 'Bruto Netto Calculator', 'Salariswijzer', 'Direct vacature plaatsen',
                         'Kandidaten zoeken', 'Bekijk de webshop', 'Intermediair', 'Volg ons op Facebook']
            for li in li_elements:
                if li.text not in remove_ls:
                    text.append(li.text)
            vacature.append(''.join(text))
            vacatures.append(vacature)
        except TimeoutException as ex:
            isrunning = 0
            print("Exception has been thrown. " + str(ex))
        except NoSuchElementException:
            continue
        finally:
            # BUG FIX: the original's NoSuchElementException branch hit
            # `continue` without closing, leaking one Firefox process per
            # failing URL; `finally` closes the browser on every path.
            driver.close()
Python Selenium webdriver is not thread-safe. This means your browser can not correctly consume asynchronous calls from multiple threads. Try to scrape websites with requests and bs4 + lxml. It's much faster than Selenium. This answer can be helpful.
You're using Firefox which is slower than Chrome in almost all real-life applications.
Xpath is the slowest selector, match by id or class. If that is not possible then by CSS.
Use headless mode and don't load images unless you need to.
You can use Scrapy and this is much faster and more flexible than anything. See link for more information.

Python: requests hang for hours

I am using requests to resolve URLs for about 410K check-ins. However, the process hangs somewhere for hours and I am not sure where the problem is. I did the same thing for 1.7M pieces of data before and it worked well. Here is my code:
# Compiled once at module level so every call reuses the same pattern.
pat = re.compile("(?P<url>https?://[^\s]+)")


def resolve_url(text):
    """Extract the first URL in *text* and follow its redirects.

    Returns (url, long_url, error); the placeholders 'before', 'after' and
    'none' are kept for any part that was not found / not resolved.
    """
    url, long_url, error = 'before', 'after', 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            long_url = requests.head(url, allow_redirects=True).url
        except requests.exceptions.RequestException as e:
            error = e
    return (url, long_url, error)
# Resolve all URLs with a worker pool, checkpointing every 10 000 items.
pool = multiprocessing.Pool(200)
resolved_urls = []
for i, res in enumerate(pool.imap(resolve_url, text_with_url)):
    resolved_urls.append(res)
    if i % 10000 == 0 and i > 0:
        print("%d elements have been processed, %2.5f seconds" % (i + 1, time.time() - t0))
        # BUG FIX: pickle writes BYTES; opening the checkpoint file in text
        # mode ("w") raises TypeError on Python 3 (this script uses print(),
        # so it is Python 3).  Use "wb" and a context manager so the handle
        # is closed even if dump() fails.
        with open("./yangj/resolved_urls_%d_requests.pkl" % (i + 1), "wb") as fout:
            pickle.dump(resolved_urls, fout)
        resolved_urls = []

with open("./yangj/resolved_urls_last_requests.pkl", "wb") as fout:
    pickle.dump(resolved_urls, fout)
I was wondering whether the problem is because of some exception that I need to write code to recover. I have looked through requests documents and previous similar questions but I didn't find matching answers. Any idea to solve the problem?

Searching Tweets in Python

Have a look at the following code:
# Python 2 snippet: fetch one page of tweets, then try to page through more.
q = '#MentionSomeoneImportantForYou'
count = 100
search_results = twitter_api.search.tweets(q=q, count=count)
#twitter_api is predefined and is working fine.
statuses = search_results['statuses']
for _ in range(5):
    print "Length of statuses", len(statuses)
    try:
        # NOTE(review): search_results is never reassigned inside this loop,
        # so every pass reads the SAME metadata -- and if the first lookup
        # raises KeyError, next_results is never bound, which is the
        # "'next_results' is not defined" error being asked about.
        next_results = search_results['search_metadata']['next_results']
    except KeyError, e: # No more results when next_results doesn't exist
        break
    # Parse the "?max_id=...&q=...&..." cursor string into keyword args.
    kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ])
The last code throws an error that 'next_results' is not defined.
Where did I go wrong on this?
I really don't get why you have to
next_results = search_results['search_metadata']['next_results']
when this line will return the same result on all 5 iterations?
Anyway, "next_results is not defined" means that the line above was never reached even once.
How about
print search_results['search_metadata']
to see exactly how was the API's response ?
This code works perfectly!
# Import unquote to prevent url encoding errors in next_results
from urllib.parse import unquote
# BUG FIX: json.dumps() is used at the end but json was never imported.
import json

q = '#Ethiopia'
count = 100
# See https://dev.twitter.com/docs/api/1.1/get/search/tweets
search_results = twitter_api.search.tweets(q=q, count=count)
statuses = search_results['statuses']

# Iterate through 5 more batches of results by following the cursor
print(search_results['search_metadata'])
for _ in range(5):
    print("Length of statuses", len(statuses))
    try:
        next_results = search_results['search_metadata']['next_results']
    except KeyError:  # No more results when next_results doesn't exist
        break
    # Create a dictionary from next_results, which has the following form:
    # ?max_id=313519052523986943&q=NCAA&include_entities=1
    kwargs = dict([kv.split('=') for kv in unquote(next_results[1:]).split("&")])
    # Re-query so the NEXT pass follows the fresh cursor.
    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']

# Show one sample search result by slicing the list...
print(json.dumps(statuses[0], indent=1))

Categories

Resources