Unable to run KubernetesPodOperator - python

I am using Airflow 2.4.3 and running KubernetesPodOperator.
Below are the code and the error.
Please help me with creating a KubernetesPodOperator in Python. I have tried on both GCP and Azure.
Also adding the Kubernetes provider documentation for reference:
https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/_api/airflow/providers/cncf/kubernetes/operators/kubernetes_pod/index.html#airflow.providers.cncf.kubernetes.operators.kubernetes_pod.KubernetesPodOperator
I can also share any other info if required.
from kubernetes.client import models as k8s
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator

# custom modules
from spark_API.spark_submit import SparkSubmit

# import modules
import json
import datetime
import logging
import airflow_API
import time

airflow_dag_object = AirflowDagUtilities("aaa_test_airflow_api")

def def_func1(**kwargs):
    print("In func1")
    namespace = "segmentation-pipeline"
    docker_image = "****:v6"  # name commented out
    is_delete_operator_pod = True
    docker_image_creds = [k8s.V1LocalObjectReference("****")]  # harbor name commented out
    submit_command = ["/bin/bash", "-c"]
    max_cores = 60
    driver_memory = "4g"
    executor_memory = "4g"
    submit_args = "/usr/local/spark/bin/spark-submit --master local[" + str(max_cores) + "] --driver-memory " + \
        driver_memory + " --executor-memory " + executor_memory + " "
    submit_spark_pipeline_config_conf = "--conf " + '\'' + 'spark.pipelineConfig' + "=" + json.dumps(_infra_config.get_infra_config(), separators=(',', ':')) + '\'' + " "
    submit_spark_broadcast_timeout = "--conf " + '\"' + "spark.sql.broadcastTimeout" + "=" + str("36000") + '\"' + " "
    submit_spark_max_result_size = "--conf " + '\"' + "spark.driver.maxResultSize" + "=" + str("0") + '\"' + " "
    final_dependency_jars = ["./resources/mysql_connector_java_5.1.45.jar",
                             "./resources/commons_httpclient_3.0.1.jar"]
    dependency_jars_string = ','.join(list(set(final_dependency_jars)))
    submit_spark_dependency_jars = "--conf " + '\"' + "spark.jars" + "=" + dependency_jars_string + '\"' + " "
    extra_conf = []
    extra_conf_final = []
    for conf in extra_conf:
        conf_appended_string = "--conf " + '\"' + conf + '\'' + " "
        extra_conf_final.append(conf_appended_string)
    extra_conf = " ".join(extra_conf_final) + " "
    airflow_task_settings = airflow_API.extract_airflow_task_details(kwargs['task_instance'])
    submit_spark_airflow_task_details = "--conf " + '\"' + "spark.airflowTaskDetails" + "=" + json.dumps(airflow_task_settings) + '\'' + " "
    common_submit_args_beginning = submit_args + submit_spark_broadcast_timeout + submit_spark_max_result_size + submit_spark_dependency_jars + extra_conf + submit_spark_airflow_task_details
    application_resource = "/update_scores.py"
    application_arguments = ["test_args"]
    string_application_arguments = " "
    for i in range(0, len(application_arguments)):
        string_application_arguments = string_application_arguments + " " + json.dumps(application_arguments[i])
    common_submit_args_end = application_resource + string_application_arguments
    platform_utilities = PlatformUtilities(_infra_config)
    print("platform_utilities.get_python_modules_path() -> ", str(platform_utilities.get_python_modules_path()))
    submit_spark_python_module_path = "--conf " + '\"' + "spark.modulePath" + "=" + str(platform_utilities.get_python_modules_path()) + '\"' + " "
    submit_spark_args = [common_submit_args_beginning + submit_spark_pipeline_config_conf + submit_spark_python_module_path + common_submit_args_end]
    print("submit_spark_args -> ", submit_spark_args)
    submit_in_cluster = True
    submit_spark_pod_affinity = k8s.V1Affinity(
        node_affinity=k8s.V1NodeAffinity(k8s.V1NodeSelectorTerm(
            match_expressions=[
                k8s.V1NodeSelectorRequirement(key="****", operator="In", values=["n2-highmem-8"]),
                k8s.V1NodeSelectorRequirement(key="deployment", operator="In", values=["dynamic"]),
            ]
        ))
    )
    submit_spark_pod_tolerations = [k8s.V1Toleration(key="deployment", operator="Equal", value="dynamic", effect="NoSchedule")]
    application_name = "test_airflow_api_test_task_id"
    container_resources = k8s.V1ResourceRequirements(
        requests={
            'memory': str("10Gi"),
            'cpu': str("2")
        },
        limits={
            'memory': str("50Gi"),
            'cpu': str("5")
        }
    )
    submit_startup_timeout_seconds = 600
    submit_get_logs = True
    kube_submssion = KubernetesPodOperator(namespace=namespace,
                                           image=docker_image,
                                           is_delete_operator_pod=is_delete_operator_pod,
                                           image_pull_secrets=docker_image_creds,
                                           cmds=submit_command,
                                           arguments=submit_spark_args,
                                           in_cluster=submit_in_cluster,
                                           affinity=submit_spark_pod_affinity,
                                           tolerations=submit_spark_pod_tolerations,
                                           container_resources=container_resources,
                                           name=application_name,
                                           task_id=application_name,
                                           startup_timeout_seconds=submit_startup_timeout_seconds,
                                           get_logs=submit_get_logs
                                           )
    kube_submssion.execute(context=None)

def def_func2(**kwargs):
    print("In func2")

dag_base = airflow_dag_object.get_dag_object()
func1 = PythonOperator(
    task_id='func1',
    provide_context=True,
    python_callable=def_func1,
    dag=dag_base
)
func2 = PythonOperator(
    task_id='func2',
    provide_context=True,
    python_callable=def_func2,
    dag=dag_base
)
func1 >> func2
Output error:
Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/providers/cncf/kubernetes/operators/kubernetes_pod.py", line 419, in execute
context=context,
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/providers/cncf/kubernetes/operators/kubernetes_pod.py", line 387, in get_or_create_pod
pod = self.find_pod(self.namespace or pod_request_obj.metadata.namespace, context=context)
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/providers/cncf/kubernetes/operators/kubernetes_pod.py", line 371, in find_pod
label_selector=label_selector,
File "/home/airflow/.local/lib/python3.7/site-packages/kubernetes/client/api/core_v1_api.py", line 15697, in list_namespaced_pod
return self.list_namespaced_pod_with_http_info(namespace, **kwargs) # noqa: E501
File "/home/airflow/.local/lib/python3.7/site-packages/kubernetes/client/api/core_v1_api.py", line 15826, in list_namespaced_pod_with_http_info
collection_formats=collection_formats)
File "/home/airflow/.local/lib/python3.7/site-packages/kubernetes/client/api_client.py", line 353, in call_api
_preload_content, _request_timeout, _host)
File "/home/airflow/.local/lib/python3.7/site-packages/kubernetes/client/api_client.py", line 184, in __call_api
_request_timeout=_request_timeout)
File "/home/airflow/.local/lib/python3.7/site-packages/kubernetes/client/api_client.py", line 377, in request
headers=headers)
File "/home/airflow/.local/lib/python3.7/site-packages/kubernetes/client/rest.py", line 244, in GET
query_params=query_params)
File "/home/airflow/.local/lib/python3.7/site-packages/kubernetes/client/rest.py", line 234, in request
raise ApiException(http_resp=r)
kubernetes.client.exceptions.ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Audit-Id': '6ab39ea1-f955-4481-b3eb-7b3abe747a7c', 'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Kubernetes-Pf-Flowschema-Uid': '8e487991-120d-49d0-940a-ace0b0e64421', 'X-Kubernetes-Pf-Prioritylevel-Uid': '8f6ab0b3-abdf-4782-994c-2f0f247592d2', 'Date': 'Thu, 12 Jan 2023 13:13:20 GMT', 'Content-Length': '169'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"found ',', expected: !, identifier, or 'end of string'","reason":"BadRequest","code":400}

In Airflow versions earlier than 2.3, KubernetesPodOperator used to work with a None context.
As shown in your question:
kube_submssion = KubernetesPodOperator(namespace=namespace,
                                       image=docker_image,
                                       is_delete_operator_pod=is_delete_operator_pod,
                                       image_pull_secrets=docker_image_creds,
                                       cmds=submit_command,
                                       arguments=submit_spark_args,
                                       in_cluster=submit_in_cluster,
                                       affinity=submit_spark_pod_affinity,
                                       tolerations=submit_spark_pod_tolerations,
                                       container_resources=container_resources,
                                       name=application_name,
                                       task_id=application_name,
                                       startup_timeout_seconds=submit_startup_timeout_seconds,
                                       get_logs=submit_get_logs
                                       )
kube_submssion.execute(context=None)
The execute method expects the context, as described in the documentation at the following link:
https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/_modules/airflow/providers/cncf/kubernetes/operators/kubernetes_pod.html#KubernetesPodOperator.execute
You can pass the context from **kwargs to the execute method:
kube_submssion.execute(context=kwargs)
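If you prefer not to pass kwargs through yourself, Airflow 2 also exposes the running task's context via get_current_context. A minimal sketch of the same fix (assuming the rest of def_func1 stays unchanged):

from airflow.operators.python import get_current_context

def def_func1(**kwargs):
    # ... build the KubernetesPodOperator as before ...
    context = get_current_context()  # the same context PythonOperator passes in via **kwargs
    kube_submssion.execute(context=context)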

Related

Python requests remove website from further checking if keyword found

I want to remove a website from the further checking process if my "keyword" is found on it, so it doesn't keep getting checked multiple times.
How can I do this? I am still a beginner, but I linked my whole script at the bottom. Thanks.
If the keyword "google" is found on the website currently being checked, I want to remove this website from further checking.
if "google" in r2.text:
print (bcolors.OKGREEN + "Parameters Found : " +server+ "/" + para1 + "/" + para2 + bcolors.ENDC)
client = server + "," + para1 + "," + para2 + "\n"
f = open('log.txt', 'a')
f.write(client)
f.close()
My whole script:
import os
import sys
from threading import Thread, BoundedSemaphore
from datetime import datetime
import optparse
import requests
import urllib3

os.system("color")
requests.urllib3.disable_warnings()
maxConnections = 10
connection_lock = BoundedSemaphore(maxConnections)
time = datetime.now().time()

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    ENDC = '\033[0m'

def connect(server, para1, para2):
    try:
        r = requests.request('put', server + para1 + para2, timeout=30, verify=False, headers={'Content-Type': 'application/octet-stream'})
        r.close()
    except Exception as e:
        print(e)
    r2 = requests.request('get', server + para1 + para2, verify=False, timeout=30, headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'})
    r2.close()
    if "google" in r2.text:
        print(bcolors.OKGREEN + "Parameters Found : " + server + "/" + para1 + "/" + para2 + bcolors.ENDC)
        client = server + "," + para1 + "," + para2 + "\n"
        f = open('log.txt', 'a')
        f.write(client)
        f.close()

def generate_tests(hosts, paras1, paras2):
    i = 0
    for para1 in paras1:
        para1 = para1.strip('\n\r')
        for para2 in paras2:
            para2 = para2.strip('\n\r')
            for host in hosts:
                server = host.strip('\n\r')
                print(bcolors.OKGREEN + "=" * 60 + bcolors.ENDC)
                print(bcolors.BOLD + "Website: " + bcolors.OKBLUE + server + para1 + para2 + bcolors.ENDC)
                print(bcolors.BOLD + "Parameter1: " + bcolors.OKBLUE + para1 + bcolors.ENDC)
                print(bcolors.BOLD + "Parameter2: " + bcolors.OKBLUE + para2 + bcolors.ENDC)
                i += 1
                print(bcolors.BOLD + "Attempts: " + bcolors.OKBLUE + str(i) + bcolors.ENDC)
                print(bcolors.BOLD + "Time Started: " + bcolors.OKBLUE + str(time) + bcolors.ENDC)
                print(bcolors.BOLD + "Time now: " + bcolors.OKBLUE + str(datetime.now().time()) + bcolors.ENDC)
                print(bcolors.OKGREEN + "=" * 60 + bcolors.ENDC)
                t = Thread(target=connect, args=(server, para1, para2))
                t.start()

def read_test_files(hostsfile, paras1file, paras2file):
    hosts = open(hostsfile, 'r').readlines()
    paras1 = open(paras1file, 'r').readlines()
    paras2 = open(paras2file, 'r').readlines()
    generate_tests(hosts, paras1, paras2)

def main():
    parser = optparse.OptionParser('usage python test.py -H <hosts file> -U <para1 file> -P <para2 file>')
    parser.add_option('-H', dest='hostsfile', help="specify host file to test")
    parser.add_option('-U', dest='paras1file', help="specify possible parameters1")
    parser.add_option('-P', dest='paras2file', help="specify possible parameters2")
    (options, args) = parser.parse_args()
    if options.hostsfile and options.paras1file and options.paras2file:
        hostsfile = options.hostsfile
        paras1file = options.paras1file
        paras2file = options.paras2file
        read_test_files(hostsfile, paras1file, paras2file)
    else:
        print(parser.usage)
        exit(0)

if __name__ == "__main__":
    main()
You can keep a whitelist set of websites that pass the check. Before starting a thread, check whether the server is already in the whitelist, and skip it if it is.
In the code below, the whitelist set is only kept in memory.
If you want it to persist across runs of the program, you can save it to a file: read the file every time the program starts, and write it back every time it ends.
import os
import sys
from threading import Thread, BoundedSemaphore
from datetime import datetime
import optparse
import requests
import urllib3

os.system("color")
requests.urllib3.disable_warnings()
maxConnections = 10
connection_lock = BoundedSemaphore(maxConnections)
time = datetime.now().time()
white_list = set()

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    ENDC = '\033[0m'

def connect(server, para1, para2):
    try:
        r = requests.request('put', server + para1 + para2, timeout=30, verify=False,
                             headers={'Content-Type': 'application/octet-stream'})
        r.close()
    except Exception as e:
        print(e)
    r2 = requests.request('get', server + para1 + para2, verify=False, timeout=30, headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'})
    r2.close()
    if "google" in r2.text:
        # add
        white_list.add(server)
        print(bcolors.OKGREEN + "Parameters Found : " + server + "/" + para1 + "/" + para2 + bcolors.ENDC)
        client = server + "," + para1 + "," + para2 + "\n"
        f = open('log.txt', 'a')
        f.write(client)
        f.close()

def generate_tests(hosts, paras1, paras2):
    i = 0
    for para1 in paras1:
        para1 = para1.strip('\n\r')
        for para2 in paras2:
            para2 = para2.strip('\n\r')
            for host in hosts:
                server = host.strip('\n\r')
                # Determine whether it is in the whitelist
                # The `host` in the whitelist has already been subjected to the `strip('\n\r')` operation.
                if server in white_list:
                    continue
                print(bcolors.OKGREEN + "=" * 60 + bcolors.ENDC)
                print(bcolors.BOLD + "Website: " + bcolors.OKBLUE + server + para1 + para2 + bcolors.ENDC)
                print(bcolors.BOLD + "Parameter1: " + bcolors.OKBLUE + para1 + bcolors.ENDC)
                print(bcolors.BOLD + "Parameter2: " + bcolors.OKBLUE + para2 + bcolors.ENDC)
                i += 1
                print(bcolors.BOLD + "Attempts: " + bcolors.OKBLUE + str(i) + bcolors.ENDC)
                print(bcolors.BOLD + "Time Started: " + bcolors.OKBLUE + str(time) + bcolors.ENDC)
                print(bcolors.BOLD + "Time now: " + bcolors.OKBLUE + str(datetime.now().time()) + bcolors.ENDC)
                print(bcolors.OKGREEN + "=" * 60 + bcolors.ENDC)
                t = Thread(target=connect, args=(server, para1, para2))
                t.start()
                t.join()

def read_test_files(hostsfile, paras1file, paras2file):
    hosts = open(hostsfile, 'r').readlines()
    paras1 = open(paras1file, 'r').readlines()
    paras2 = open(paras2file, 'r').readlines()
    generate_tests(hosts, paras1, paras2)

def main():
    parser = optparse.OptionParser('usage python test.py -H <hosts file> -U <para1 file> -P <para2 file>')
    parser.add_option('-H', dest='hostsfile', help="specify host file to test")
    parser.add_option('-U', dest='paras1file', help="specify possible parameters1")
    parser.add_option('-P', dest='paras2file', help="specify possible parameters2")
    (options, args) = parser.parse_args()
    if options.hostsfile and options.paras1file and options.paras2file:
        hostsfile = options.hostsfile
        paras1file = options.paras1file
        paras2file = options.paras2file
        read_test_files(hostsfile, paras1file, paras2file)
    else:
        print(parser.usage)
        exit(0)

if __name__ == "__main__":
    main()
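A minimal sketch of that persistence (assuming a hypothetical whitelist.txt file with one host per line):

WHITELIST_FILE = "whitelist.txt"  # hypothetical file name

def load_whitelist():
    # read the saved whitelist, or start with an empty set on first run
    try:
        with open(WHITELIST_FILE) as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

def save_whitelist(white_list):
    # overwrite the file with the current whitelist, one host per line
    with open(WHITELIST_FILE, "w") as f:
        f.write("\n".join(sorted(white_list)))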

Unindent does not match while it seems absolutely correct

My code throws an error and I really can't understand why. Here it is:
File "alon.py", line 152
fig.write_image("files/table_" + product_name + ".pdf")
^
IndentationError: unindent does not match any outer indentation level
If I remove this line, it works. I can't see how it is unindented: it is under the if type(product_data) is dict: statement, on the same level as the last line of code before it. What can cause such behaviour?
import MySQLdb
from plotly import graph_objs as go
import numpy as np
import os
from plotly.subplots import make_subplots
from PyPDF2 import PdfFileMerger
from datetime import datetime, timedelta

# Database connect
db = MySQLdb.connect(host="localhost",
                     user="root",
                     passwd="abc9110033969",
                     db="alon")

today = datetime.today().strftime('%Y-%m-%d')
one_week = (datetime.today() - timedelta(days=7)).strftime('%Y-%m-%d')
two_week = (datetime.today() - timedelta(days=14)).strftime('%Y-%m-%d')
three_week = (datetime.today() - timedelta(days=21)).strftime('%Y-%m-%d')
four_week = (datetime.today() - timedelta(days=28)).strftime('%Y-%m-%d')

# Functions
def load_post_views(table, today, one_week, two_week, three_week, four_week):
    product_views_dict = dict()
    cursor = db.cursor()
    cursor.execute(
        "SELECT client_id, product_id, referrer, `date`" +
        " FROM " + table +
        " WHERE `date`>='" + four_week + "'")
    social_dict = {
        "facebook": 0,
        "twitter": 0,
        "instagram": 0,
        "linkedin": 0,
        "pinterest": 0,
        "website": 0,
    }
    for x in range(0, cursor.rowcount):
        row = cursor.fetchone()
        network = ""
        period = ""
        client_id = row[0]
        product_id = row[1]
        referrer = row[2]
        date = str(row[3])
        email_cursor = db.cursor()
        email_cursor.execute("SELECT address FROM c8ty_connections_email WHERE entry_id=" + str(client_id))
        email = email_cursor.fetchone()
        product_cursor = db.cursor()
        product_cursor.execute("SELECT post_title FROM c8ty_posts WHERE id=" + str(product_id))
        product_name = product_cursor.fetchone()
        # Add client ID key
        if client_id not in product_views_dict:
            product_views_dict[client_id] = dict()
        # Add product ID key to client ID parent key
        if product_id not in product_views_dict[client_id]:
            product_views_dict[client_id][product_id] = {
                today + " - " + one_week: social_dict,
                one_week + " - " + two_week: social_dict,
                two_week + " - " + three_week: social_dict,
                three_week + " - " + four_week: social_dict
            }
        # Find referrer
        if "facebook" in referrer:
            network = "facebook"
        elif "twitter" in referrer:
            network = "twitter"
        elif "instagram" in referrer:
            network = "instagram"
        elif "linkedin" in referrer:
            network = "linkedin"
        elif "pinterest" in referrer:
            network = "pinterest"
        else:
            network = "website"
        # Check view period
        if date <= today and date > one_week:
            period = today + " - " + one_week
        if date <= one_week and date > two_week:
            period = one_week + " - " + two_week
        if date <= two_week and date > three_week:
            period = two_week + " - " + three_week
        if date <= three_week and date > four_week:
            period = three_week + " - " + four_week
        product_views_dict[client_id][product_id][period][network] += 1
        product_views_dict[client_id]["email"] = email[0]
        product_views_dict[client_id][product_id]["product"] = product_name[0]
    return product_views_dict

# Init
product_views_dict = load_post_views("an_product_view", today, one_week, two_week, three_week, four_week)
brochure_views_dict = load_post_views("an_brochure_view", today, one_week, two_week, three_week, four_week)

for clinetID, product_info in product_views_dict.items():
    client_email = product_info["email"]
    for productID, product_data in product_info.items():
        if type(product_data) is dict:
            product_name = product_data['product']
            table_data = [
                [
                    today + " - " + one_week,
                    one_week + " - " + two_week,
                    two_week + " - " + three_week,
                    three_week + " - " + four_week,
                    today + " - " + four_week
                ]
            ]
            for network in ["website", "facebook", "twitter", "instagram", "linkedin", "pinterest"]:
                table_data.append([
                    product_data[today + " - " + one_week][network],
                    product_data[one_week + " - " + two_week][network],
                    product_data[two_week + " - " + three_week][network],
                    product_data[three_week + " - " + four_week][network],
                    sum([
                        int(product_data[today + " - " + one_week][network]),
                        int(product_data[one_week + " - " + two_week][network]),
                        int(product_data[two_week + " - " + three_week][network]),
                        int(product_data[three_week + " - " + four_week][network])
                    ])
                ])
            fig = make_subplots(rows=5, cols=2)
            # Create product table
            fig.add_trace(
                go.Table(
                    header=dict(values=["Period", "Website", "Facebook", "Twitter", "Instagram", "LinkedIn", "Pinterest", "Total"]),
                    cells=dict(values=table_data)
                )
            )
            # Create folder if it doesn't exist
            if not os.path.exists("files"):
                os.mkdir("files")
            # Write pdf
            fig.write_image("files/table_" + product_name + ".pdf")

db.close()
exit()
Check that your source file isn't mixing spaces and tabs. Indentation should be consistent, using only one or the other; I recommend spaces, to comply with PEP 8.
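As a quick way to locate the offending line, the standard library's tabnanny module reports ambiguous tab/space indentation. A minimal sketch (assuming the file is named alon.py, as in the traceback):

import tabnanny

# prints the file name and line number of any ambiguous indentation
tabnanny.check("alon.py")

The same check is available from the command line with python -m tabnanny alon.py.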

Appending to array using pool

I am trying to scrape data from soccerway.com, checking whether each page is a completed game or a game still to be played, with each kind written to a separate csv file. I am running through 10,000 pages, so I have written it using Pools. However, I am getting empty lists from the append calls and cannot write anything to the csv files.
I tried writing straight to the file instead of appending to lists, but this gave incomplete files.
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import uuid
import time
from multiprocessing import Pool
import sys, os

fixturesA = []
linksA = []
statsA = []

def parse(url):
    try:
        #print(url)
        delays = [0.25, 0.5, 0.75, 1]
        delay = np.random.choice(delays)
        #time.sleep(delay)
        #r = requests.get(url)
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.content, "html.parser")
        teams = soup.findAll('h3', attrs={'class': 'thick'})
        homeTeam = teams[0].text.strip()
        awayTeam = teams[2].text.strip()
        middle = teams[1].text.strip()
        dds = soup.findAll('dd')
        date = dds[1].text.strip()
        gameWeek = dds[2].text.strip()
        if ':' not in middle:
            middle = middle.split(" - ")
            homeGoals = 0
            awayGoals = 0
            homeGoals = middle[0]
            try:
                awayGoals = middle[1]
            except Exception as e:
                homeGoals = "-1"
                awayGoals = "-1"
            matchGoals = int(homeGoals) + int(awayGoals)
            if(matchGoals >= 0):
                if(int(homeGoals) > 0 and int(awayGoals) > 0):
                    btts = "y"
                else:
                    btts = "n"
                halfTimeScore = dds[4].text.strip().split(" - ")
                firstHalfHomeGoals = halfTimeScore[0]
                firstHalfAwayConc = halfTimeScore[0]
                firstHalfAwayGoals = halfTimeScore[1]
                firstHalfHomeConc = halfTimeScore[1]
                firstHalfTotalGoals = int(firstHalfHomeGoals) + int(firstHalfAwayGoals)
                secondHalfHomeGoals = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayConc = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayGoals = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfHomeConc = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfTotalGoals = matchGoals - firstHalfTotalGoals
                homeTeamContainers = soup.findAll('div', attrs={'class': 'container left'})
                homeTeamStarting = homeTeamContainers[2]
                homeTeamBench = homeTeamContainers[3]
                homeTeamYellows = len(homeTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/YC.png'})) + len(homeTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/YC.png'}))
                homeTeamReds = len(homeTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/RC.png'})) + len(homeTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/RC.png'}))
                homeTeamCards = homeTeamYellows + homeTeamReds
                awayTeamContainers = soup.findAll('div', attrs={'class': 'container right'})
                awayTeamStarting = awayTeamContainers[2]
                awayTeamBench = awayTeamContainers[3]
                awayTeamYellows = len(awayTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/YC.png'})) + len(awayTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/YC.png'}))
                awayTeamReds = len(awayTeamStarting.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/700/img/events/RC.png'})) + len(awayTeamBench.findAll('img', attrs={'src': 'https://s1.swimg.net/gsmf/699/img/events/RC.png'}))
                awayTeamCards = awayTeamYellows + awayTeamReds
                matchCards = homeTeamCards + awayTeamCards
                try:
                    iframe = soup.findAll('iframe')
                    iframeSrc = iframe[1]['src']
                    url = 'https://us.soccerway.com/' + iframeSrc
                    c = requests.get(url, timeout=10)
                    soupC = BeautifulSoup(c.content, "html.parser")
                    cornerContainer = soupC.findAll('td', attrs={'class': 'legend left value'})
                    homeCorners = cornerContainer[0].text.strip()
                    awayCornersConc = homeCorners
                    cornerContainer = soupC.findAll('td', attrs={'class': 'legend right value'})
                    awayCorners = cornerContainer[0].text.strip()
                    homeCornersConc = awayCorners
                    matchCorners = int(homeCorners) + int(awayCorners)
                    print("Got Score . " + homeTeam + " vs " + awayTeam + " . " + gameWeek)
                    statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + homeCorners + "," + awayCorners + "," + homeCornersConc + "," + awayCornersConc + "," + str(matchCorners) + "," + dds[0].text.strip() + "\n")
                    return None
                except Exception as e:
                    print("Got Score no corners. " + homeTeam + " vs " + awayTeam + " . " + gameWeek + " NO FRAME")
                    statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + "" + "," + "" + "," + "" + "," + "" + "," + "" + "," + dds[0].text.strip() + "\n")
                    return None
        else:
            fixturesA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + date + "\n")
            linksA.append(url + "\n")
            print(homeTeam + " vs " + awayTeam + " at " + middle + " GW:" + gameWeek)
            return None
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        linksA.append(url + "\n")
        print(url)
        return None

stats = open('Statsv2.csv', 'a', encoding='utf-8')
fixtures = open('fixturesv2.csv', 'w', encoding='utf-8')
with open('links.txt') as f:
    content = f.readlines()
content = [x.strip() for x in content]
links = open('links.txt', 'w')

if __name__ == '__main__':
    start_time = time.time()
    p = Pool(20)  # Pool tells how many at a time
    records = p.map(parse, content)
    p.terminate()
    p.join()
    print("--- %s seconds ---" % (time.time() - start_time))
I assume you are running Windows? Then the answer is that multiprocessing on Windows creates copies (it spawns new processes) instead of forking. So you have your main process with the lists, and your worker processes (from the pool) get their own separate copies of those lists.
The workers most likely fill their own lists correctly, but the lists in the main process never receive any data and so stay empty, and the workers do not return anything. Since you write your files in the main process, you get empty files.
An easy way to solve this is to create pipes or queues between the main process and the workers to allow communication between them; one variant of that idea is sketched below. You could also use the shared arrays provided by the multiprocessing module, but then you would need to know the length at creation time.
See the documentation: Multiprocessing
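A minimal sketch of that shared-state idea, using a Manager list (assuming the parse function from the question; the URLs here are placeholders). Unlike plain module-level lists, appends to a Manager list are forwarded to the parent process:

from multiprocessing import Manager, Pool
from functools import partial

def parse(url, stats):
    # ... do the scraping work from the question ...
    stats.append(url + "\n")  # forwarded to the parent process

if __name__ == '__main__':
    with Manager() as manager:
        stats = manager.list()
        urls = ["http://example.com/1", "http://example.com/2"]  # placeholder URLs
        with Pool(20) as pool:
            pool.map(partial(parse, stats=stats), urls)
        print(list(stats))  # now populated in the parent process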
As pointed out by @RaJa, you're not actually doing anything the parent/controlling process can see. The easiest fix is to return values from the mapped function.
For example, parse() could return a tuple at the end:
def parse(url):
    # do work
    return url, homeTeam, awayTeam, gameWeek, homeGoals, awayGoals  # ...
then the parent process can receive the values and do useful things like saving them to a CSV file:
import csv

with Pool(20) as pool:
    records = pool.map(parse, content)

with open('stats.csv', 'w') as fd:
    out = csv.writer(fd)
    out.writerow([
        'url', 'hometeam', 'awayteam',
        # and the remaining column names for the header
    ])
    out.writerows(records)
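One caveat worth noting: parse() in the question returns None on failure, and csv's writerows cannot handle None rows, so it may be worth filtering them before writing:

# keep only the successfully parsed rows; parse() returns None on failure
records = [r for r in records if r is not None]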

how do I find a way to use ssh and rsync

I am having trouble implementing ssh and rsync with a private key in Python, using Popen (subprocess).
Basically, the rsync syntax to use should be:
$ rsync -az --log-file=$logdir/logfile.out \
    -e 'ssh -i /home/user/.ssh/id_rsa' user@server:/target-directory
What I have is this:
import subprocess
First, I build my logdir path, with variables:
logdir = [basedir + '/' + 'log' + '/' + today + '/' + 'pfdcopy_' \
+ typ + '_' + ts + '.log.txt']
Then I build the target directory:
target = ['jboss' + '@' + targetsvr + ':' + '/data']
Finally, I try to run this code
p1 = subprocess.Popen(['rsync', '-az', '--log-file=%s' % \
logdir/logfile.out , '-e', 'ssh', '-i', \
'/home/user/.ssh/id_rsa', target])
It's quite complex, I know, mainly because of the variables, and the quotation marks.
Running this, I always get different syntax errors with p1.
Any help is highly appreciated. Thanks!
Edited (08-10-2018):
Here is my complete runnable code snippet:
from datetime import datetime
import subprocess
import os
import fnmatch

now = datetime.now()
today = now.strftime("%Y-%m-%d")
ts = now.strftime("%Y-%m-%d-%H-%M-%S")
sign_output_dir = '/Users/fanta4/Documents/python-files/outgoing'
mandator = 'BTV'
formsize = 'A4'
basedir = '/Users/fanta4/Documents'
pdf_to_send = []
targetsvr = 'nas1'
doktyp = (
    'WPGEBUEHR', 'WPDURCHFU', 'WPABR', 'WPABRKF', 'WPABRVK', 'WPABRTILG', 'WPABRERTR', 'WPAMIS', 'WPSTREP',
    'WPABLAUF', 'WPAVISO', 'WPAUSZUG', 'WPERTRAEG', 'WPSIKTEST', 'WPTRANS', 'WPANSCHAFF', 'KKKONTOMIT', 'KRKURSUEW',
    'WPVERLUSTA', 'WPVERLUSTG')

os.chdir(sign_output_dir)
for file in os.listdir(sign_output_dir):
    if fnmatch.fnmatch(file, '*.pdf'):
        pdf_to_send.append(file)
os.chdir(sign_output_dir)
print('debug: doktyp ist: {}'.format(formsize))
for typ in doktyp:
    if typ in str(pdf_to_send):
        ts = now.strftime("%Y-%m-%d-%Hh-%Mm-%Ss")
        print('typ: {:12s} exists -> will be transfered to nas1'.format(typ))
        logdir = [basedir + '/' + 'log' + '/' + mandator + '/' + today + '/' + 'pfdcopy_' + typ + '_' + ts + '.log.txt']
        target = ['jboss' + '@' + targetsvr + '/data' + '/' + mandator + typ]
        p1 = subprocess.Popen(
            ['rsync', '-az', '--log-file=%s' % logdir, '-e', 'ssh', '-i', '/Users/fanta4/.ssh/id_rsa', typ, '-l', target])
        p1.returncode
        if p1 > 0:
            print('debug: Error with rsync of typ: {} to target: {}'.format(typ, targetsvr))
        else:
            print('debug: rsync mandator: {:3s} with typ: {:12s} succeeded'.format(mandator, typ))
    else:
        print('debug: typ: {:12s} does not exist'.format(typ))
logfile = ['/data' + '/' + 'log' + '/' + mandator + '/' + ts]
print('debug: pls see logfile in: {}'.format(logfile))
If I run this code, I get:
/Users/fanta4/anaconda3/bin/python "/Users/fanta4/Library/Mobile Documents/com~apple~CloudDocs/entw/python/prog/rsync-test.py"
Traceback (most recent call last):
/Users/fanta4/Documents/python-files/outgoing
debug: doktyp ist: A4
File "/Users/fanta4/Library/Mobile
Documents/com~apple~CloudDocs/entw/python/prog/rsync-test.py", line 37, in <module>
typ: WPGEBUEHR exists -> will be transfered to nas1
['rsync', '-az', '--log-file=%s' % logdir, '-e', 'ssh', '-i', '/Users/fanta4/.ssh/id_rsa', typ, '-l', target])
File "/Users/fanta4/anaconda3/lib/python3.6/subprocess.py", line 709, in __init__
restore_signals, start_new_session)
File "/Users/fanta4/anaconda3/lib/python3.6/subprocess.py", line 1275, in _execute_child
restore_signals, start_new_session, preexec_fn)
TypeError: expected str, bytes or os.PathLike object, not list
Process finished with exit code 1
Your issue is exemplified by the following lines (when used to generate items which are later used as elements inside an argument vector):
logdir = [basedir + '/' + 'log' + '/' + mandator + '/' + today + '/' + 'pfdcopy_' + typ + '_' + ts + '.log.txt']
target = ['jboss' + '@' + targetsvr + '/data' + '/' + mandator + typ]
You're defining logdir and target as lists (with only one string inside them), whereas they need to be strings.
Just take out the square brackets that create a list, and you'll have strings instead:
logdir = basedir + '/log/' + mandator + '/' + today + '/pfdcopy_' + typ + '_' + ts + '.log.txt'
target = 'jboss@' + targetsvr + '/data/' + mandator + typ
You haven't mentioned what syntax errors you are getting; it would really be to everyone's benefit if you included that information. I am guessing it's the missing quotes around one of the string parameters.
p1 = subprocess.Popen([
'rsync', '-az', '--log-file=%s' % 'logdir/logfile.out',
'-e', 'ssh', '-i',
'/home/user/.ssh/id_rsa', target
])
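As a side note, subprocess.run waits for the command and exposes the exit status directly, which avoids comparing the Popen object itself as in if p1 > 0. A sketch (assuming logdir and target are plain strings as discussed above, with source as a placeholder for the files to copy):

import subprocess

result = subprocess.run(
    ['rsync', '-az', '--log-file=' + logdir,
     '-e', 'ssh -i /home/user/.ssh/id_rsa',
     source, target])  # source is a placeholder, e.g. a local directory
if result.returncode != 0:
    print('rsync failed with exit code', result.returncode)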

Python 3.7: How to get the Windows user Login Time?

I am trying to get the system user's login time using Python 3.7. I have tried the win32net and platform modules, but the functions I need are not defined in the platform module, and win32net is not compatible with Python 3 and above. I have tried the following code:
import platform
platform.uname()

import platform
os_name = platform.uname()[0].lower()
if os_name == "windows":
    get_win_login_time()
elif os_name.endswith("nix"):
    get_nix_login_time()
Try these (install the pywin32 package first to get win32com.client; subprocess is part of the standard library):
import win32com.client, time

strComputer = "."
objWMIService = win32com.client.Dispatch("WbemScripting.SWbemLocator")
objSWbemServices = objWMIService.ConnectServer(strComputer, "root\cimv2")
colItems = objSWbemServices.ExecQuery("SELECT * FROM Win32_NetworkLoginProfile")

def Convert_to_human_time(dtmDate):
    strDateTime = ""
    if dtmDate[4] == 0:
        strDateTime = dtmDate[5] + '/'
    else:
        strDateTime = dtmDate[4] + dtmDate[5] + '/'
    if dtmDate[6] == 0:
        strDateTime = strDateTime + dtmDate[7] + '/'
    else:
        strDateTime = strDateTime + dtmDate[6] + dtmDate[7] + '/'
    strDateTime = strDateTime + dtmDate[0] + dtmDate[1] + dtmDate[2] + dtmDate[3] + " " + dtmDate[8] + dtmDate[9] + ":" + dtmDate[10] + dtmDate[11] + ':' + dtmDate[12] + dtmDate[13]
    return strDateTime

for objItem in colItems:
    if objItem.Name is not None:
        print("Name: " + str(objItem.Name))
    if objItem.LastLogon is not None:
        print("Last Logon (Normal Format): " + str(objItem.LastLogon))
        print("Last Logon (Human Readable Format): " + Convert_to_human_time(objItem.LastLogon))
    if objItem.LastLogoff is not None:
        print("Last Logoff (Normal Format): " + str(objItem.LastLogoff))
        print("Last Logoff (Human Readable Format): " + Convert_to_human_time(objItem.LastLogoff))
    if objItem.LogonHours is not None:
        print("Logon Hours: " + str(objItem.LogonHours))
    if objItem.LogonServer is not None:
        print("Logon Server: " + str(objItem.LogonServer))
    if objItem.NumberOfLogons is not None:
        print("Number Of Logons: " + str(objItem.NumberOfLogons))
Another way:

from subprocess import check_output
import sys

get_result = check_output("wmic netlogin get name, fullname, lastlogon", shell=True, stderr=False)
print(get_result)
clean_result = str(get_result).lstrip("b'").rstrip("'").replace("\\r\\r\\n", "\n").replace('\n\n', '\n').split('\n')[2:-1]
for items in clean_result:
    print(items.lstrip().rstrip())
Good Luck ...
