This one was a struggle... Waiting for the page is the important part.
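The takeaway: this is an Angular page that keeps re-rendering its elements, so a click on a previously located element intermittently throws StaleElementReferenceException. What fixed it was wrapping every click in an explicit wait plus a retry loop. Here is that pattern distilled into a minimal sketch (the helper name safe_click and its retries parameter are my own naming for illustration, not something from the target page or the full script):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
import time

def safe_click(wd, xpath, retries=5):
    # Re-locate the element on every attempt so a stale reference is never reused.
    wait = WebDriverWait(wd, 30)
    for _ in range(retries):
        try:
            wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            return
        except (StaleElementReferenceException, TimeoutException):
            time.sleep(0.5)  # the page is still re-rendering; wait and look the element up again
    raise RuntimeError('click failed: ' + xpath)

The full script follows.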
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

RETRY = 5


def select_by_xpath(wd, xpath, text):
    # Locate the <select> element by XPath
    em = wd.find_element_by_xpath(xpath)
    # Choose the <select> value by its visible text
    em = Select(em)
    # The available choices can be listed via em.options:
    # for i in em.options:
    #     print(i.text)
    em.select_by_visible_text(text)


def click_by_xpath(wd, xpath):
    # We must wait until the element is clickable; otherwise the following
    # error appears intermittently:
    # selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
    # http://www.seleniumqref.com/api/python/conditions/Python_presence_of_element_located.html
    # In the end this is handled with an explicit wait plus a retry loop.
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath failed: ' + xpath)
            exit()
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath retried: ' + xpath)
            cnt += 1
            time.sleep(0.5)


def click_by_xpath_with_scroll(wd, xpath):
    # Same as click_by_xpath, but scroll down when the click fails
    # (the target may not have been loaded yet)
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath_with_scroll failed: ' + xpath)
            exit()
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath_with_scroll retried: ' + xpath)
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            cnt += 1
            time.sleep(0.5)


def get_last_page(wd):
    time.sleep(5)
    xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding"]'
    em = wd.find_elements_by_xpath(xpath)
    return em[-1].text


def scraping():
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    # options.add_argument('--window-size=1280,1280')
    wd = webdriver.Chrome(options=options)
    wd.implicitly_wait(20)
    wd.get('https://access.redhat.com/errata')

    # Select the product
    # The dropdown button has to be clicked first, or the option cannot be chosen
    click_by_xpath(wd, '//div[@class="col-sm-3"]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux"]')

    # Select the variant (OS)
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_variant")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux Server"]')

    # Select the version
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_version")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="6"]')

    # Select the architecture
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_architecture")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="x86_64"]')

    # Show the maximum number of rows per page
    select_by_xpath(wd, '//select[@ng-model="pageSize"]', '100')

    # Get the number of the last page
    last_page = get_last_page(wd)

    # Walk every page and print the table contents
    for i in range(1, int(last_page) + 1):
        xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding" and text()="' + str(i) + '"]'
        click_by_xpath_with_scroll(wd, xpath)
        html = wd.page_source
        bs = BeautifulSoup(html, 'html.parser')
        table = bs.findAll('table', {'id': 'DataTables_Table_0'})[0]
        rows = table.findAll("tr")
        for row in rows:
            for cell in row.findAll('span', {"class": "cell-content"}):
                print(cell.get_text())
        time.sleep(1)

    wd.quit()


if __name__ == '__main__':
    scraping()