一个python selenium的实用实例,比demo重,但也不算太复杂。
trick总结如下:
最新chromedriver的地址,https://googlechromelabs.github.io/chrome-for-testing,这很重要,不然就要处理chrome自动更新之类的烦人问题。很多下载来源都有点过时。
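用options配置路径以及socks5代理:options.binary_location指向的是Chrome浏览器可执行文件的路径(chromedriver本身的路径需要通过Service(executable_path=...)传入),代理用--proxy-server参数设置。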
driver.page_source可以打印当前html,可以辅助判断执行进度。
XPath (XML Path Language) 是一门在 XML 文档中通过元素和属性进行导航的语言,属于w3c标准。HTML并不是严格意义上的XML(XHTML才是),但浏览器解析HTML后得到的DOM同样是一棵节点树,所以HTML也可以使用XPath。XPATH选择器的发挥比较稳定。
python有个traceback模块,在except块里print(traceback.format_exc())可以把完整的异常堆栈原样打印出来(和未捕获异常时看到的输出一样),同时异常不会继续向上抛出,程序可以接着往下执行。
爬虫项目,遇到了element not interactable,可能是尝试点击的时候元素还未可见。
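解决办法就是在等待元素出现(Python里是 EC.presence_of_element_located,对应JS版selenium的 until.elementLocated)之后,再添加一个元素可见的判断(Python里是 EC.visibility_of_element_located,对应JS版的 until.elementIsVisible)。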
我这的原因是,网页没有做页面适配,如果窗口太小,按钮是看不见的。所以治本的解决方案是把窗口最大化,上面的判断只能起到问题诊断的作用而非治疗作用。
把sleep都换成until,优雅一点。
# If the front-end page changes, update the matching XPaths in config.ini;
# larger page changes will require code changes as well.
import selenium.common.exceptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
# expected_conditions decides whether to keep waiting, based on the page
# title, the URL, and whether an element is present / visible / clickable.
from selenium.webdriver.support import expected_conditions as EC
import configparser
import csv
from time import sleep


# TODO: use a more object-oriented design, e.g. add a "test connection" step.
# TODO: move remaining constants, user name and password into the config file.
class Spider:
    """Selenium-driven exporter that dumps job details from a web table to CSV.

    All settings are read from ``config.ini`` in the working directory:

    * ``[envs]``: ``chrome_driver_location``, ``proxy``, ``username``,
      ``password``
    * ``[xpath]``: ``username_input``, ``password_input``, ``login_button``,
      ``table_body``, ``detail_button``, ``jobname_input``, ``email_input``,
      ``groupid_input``, ``topiclist_pre``, ``close_button``
    * ``[frontend]``: ``next_button``, ``table_maxsize``
    * ``[login]`` / ``[urls]``: one login URL and one job-list URL per key
    """

    # Seconds every explicit wait polls before raising TimeoutException.
    WAIT_SECONDS = 10

    def __init__(self):
        """Read ``config.ini`` and start a Chrome session with the proxy set."""
        self.config = configparser.ConfigParser()
        self.config.read('config.ini')
        options = Options()
        # NOTE(review): Selenium's ``binary_location`` is the path of the
        # browser executable, while the config key is named after the
        # driver -- confirm which path config.ini actually stores.
        options.binary_location = self.config['envs']['chrome_driver_location']
        options.add_argument('--proxy-server=' + self.config['envs']['proxy'])
        options.add_argument('--start-maximized')
        # Pass by keyword: the first positional parameter of
        # webdriver.Chrome has not been the same in every Selenium release.
        self.driver = webdriver.Chrome(options=options)

    def request(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def test_page(self):
        """Print the current page's HTML; handy for checking progress."""
        print(self.driver.page_source)

    def login(self):
        """Fill in the credentials from the config and submit the login form.

        Raises:
            selenium.common.exceptions.TimeoutException: if an input or the
                login button does not become available within the wait.
        """
        wait = WebDriverWait(self.driver, self.WAIT_SECONDS)
        username = self.config['envs']['username']
        password = self.config['envs']['password']
        xpath_username = self.config['xpath']['username_input']
        xpath_password = self.config['xpath']['password_input']
        xpath_login = self.config['xpath']['login_button']

        username_input = wait.until(
            EC.presence_of_element_located((By.XPATH, xpath_username)))
        password_input = wait.until(
            EC.presence_of_element_located((By.XPATH, xpath_password)))
        username_input.send_keys(username)
        password_input.send_keys(password)
        # Wait until the button can really be clicked instead of clicking
        # whatever find_element returns right away; this avoids
        # "element not interactable" when the button is not ready yet.
        signin_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, xpath_login)))
        signin_button.click()

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

    def _read_unlocked_value(self, wait, xpath):
        """Return the ``value`` of the element at ``xpath``.

        The ``disabled`` attribute is stripped first, exactly as the export
        has always done before reading these fields.
        """
        element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
        self.driver.execute_script(
            "arguments[0].removeAttribute('disabled')", element)
        return element.get_attribute("value")

    def _first_row_text(self, xpath_rows):
        """Return the text of the first element matching ``xpath_rows``.

        Returns ``None`` when nothing matches.
        """
        rows = self.driver.find_elements(By.XPATH, xpath_rows)
        return rows[0].text if rows else None

    def _wait_for_page_change(self, xpath_rows, previous_text):
        """Wait until the table's first row shows text other than ``previous_text``.

        Returns:
            True once the first row differs, False if it is still unchanged
            (or the table is empty) when the wait times out.
        """
        stale_tolerant_wait = WebDriverWait(
            self.driver,
            self.WAIT_SECONDS,
            # Rows may be replaced while we are reading them.
            ignored_exceptions=[
                selenium.common.exceptions.StaleElementReferenceException],
        )

        def first_row_changed(_driver):
            current = self._first_row_text(xpath_rows)
            return current is not None and current != previous_text

        try:
            stale_tolerant_wait.until(first_row_changed)
        except selenium.common.exceptions.TimeoutException:
            return False
        return True

    def export_running_job(self):
        """Export the job table of every configured URL to ``<key>.csv``.

        For each key in ``[urls]`` the job list is opened (logging in once,
        before the first list), every row's detail dialog is read, and one
        CSV row ``[jobname, email, groupid, topiclist innerHTML]`` is
        written per job. The browser is always closed afterwards, including
        when an error interrupts the export.

        Raises:
            selenium.common.exceptions.TimeoutException: if an expected
                element does not show up within the wait.
        """
        wait = WebDriverWait(self.driver, self.WAIT_SECONDS)
        xpath_table = self.config['xpath']['table_body']
        xpath_template = self.config['xpath']['detail_button']
        xpath_jobname = self.config['xpath']['jobname_input']
        xpath_email = self.config['xpath']['email_input']
        xpath_groupid = self.config['xpath']['groupid_input']
        xpath_topiclist = self.config['xpath']['topiclist_pre']
        xpath_next = self.config['frontend']['next_button']
        xpath_close = self.config['xpath']['close_button']
        page_size = int(self.config['frontend']['table_maxsize'])
        # The third-from-last path segment of the detail-button XPath is the
        # row selector; it is rewritten to tr[i] for every row.
        segments = xpath_template.split('/')
        logged_in = False

        try:
            for key in self.config['urls']:
                if not logged_in:
                    self.request(self.config['login'][key])
                    self.login()
                    logged_in = True
                self.request(self.config['urls'][key])
                self.driver.maximize_window()
                # The built-in Mac display is small, so the window has to be
                # resized and moved before the buttons become visible.
                self.driver.set_window_size(1800, 900)
                self.driver.set_window_position(-700, 10)

                records = []
                while True:
                    # Re-locate the rows on every page: the row count of the
                    # current page drives both the loop below and the
                    # "is there another page" check.
                    rows = wait.until(EC.presence_of_all_elements_located(
                        (By.XPATH, xpath_table)))
                    for i in range(1, len(rows) + 1):
                        segments[-3] = 'tr[' + str(i) + ']'
                        xpath_item = '/'.join(segments)
                        detail_button = wait.until(
                            EC.visibility_of_element_located(
                                (By.XPATH, xpath_item)))
                        detail_button.click()

                        jobname = self._read_unlocked_value(wait, xpath_jobname)
                        email = self._read_unlocked_value(wait, xpath_email)
                        groupid = self._read_unlocked_value(wait, xpath_groupid)
                        topiclist = wait.until(EC.presence_of_element_located(
                            (By.XPATH, xpath_topiclist)))
                        topics = topiclist.get_attribute("innerHTML")
                        records.append([jobname, email, groupid, topics])

                        close_button = wait.until(
                            EC.visibility_of_element_located(
                                (By.XPATH, xpath_close)))
                        close_button.click()

                    # A page that is not full is taken to be the last one.
                    if len(rows) != page_size:
                        break
                    previous_text = self._first_row_text(xpath_table)
                    self.driver.find_element(By.XPATH, xpath_next).click()
                    # If the table still shows the same first row after the
                    # click there was no further page; stop rather than
                    # exporting the same page again and again.
                    if not self._wait_for_page_change(xpath_table, previous_text):
                        break

                # newline='' is what the csv module expects of its file
                # object; utf-8 keeps non-ASCII field values writable on
                # every platform.
                with open(key + '.csv', 'wt', newline='', encoding='utf-8') as f:
                    cw = csv.writer(f, lineterminator='\n')
                    cw.writerows(records)
        finally:
            # Never leave a browser process behind, even on failure.
            self.close()


if __name__ == '__main__':
    spider = Spider()
    spider.export_running_job()