That was a struggle... Waiting is the key.
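The errata list is an Angular page (note the ng-* attributes in the XPaths), so elements go stale as the page re-renders; the script below pairs explicit WebDriverWait calls with a simple retry loop to cope.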
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import sys
import time

RETRY = 5
def select_by_xpath(wd, xpath, text):
    # Locate the <select> element by XPath.
    em = wd.find_element(By.XPATH, xpath)
    # Choose the option by its visible text.
    em = Select(em)
    # Select.options exposes the available choices:
    # for i in em.options:
    #     print(i.text)
    em.select_by_visible_text(text)
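# Usage example (the same call appears in scraping() below):
#   select_by_xpath(wd, '//select[@ng-model="pageSize"]', '100')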
def click_by_xpath(wd, xpath):
    # The element must be clickable before calling click(); otherwise this
    # error shows up intermittently:
    # selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
    # http://www.seleniumqref.com/api/python/conditions/Python_presence_of_element_located.html
    # In the end, an explicit wait plus a retry loop handles it.
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath failed: ' + xpath)
            sys.exit(1)
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath retried: ' + xpath)
            cnt += 1
            time.sleep(0.5)
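# Note: both waits are given the (By.XPATH, xpath) locator rather than a
# previously found element, so every polling attempt re-finds the element and
# a stale reference from an earlier render pass is discarded, not reused.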
def click_by_xpath_with_scroll(wd, xpath):
    # Same as click_by_xpath, but scroll down when the click fails;
    # the target may be below the fold.
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath_with_scroll failed: ' + xpath)
            sys.exit(1)
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath_with_scroll retried: ' + xpath)
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            cnt += 1
            time.sleep(0.5)
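# window.scrollTo(0, document.body.scrollHeight) jumps to the bottom of the
# page, bringing the pager links into view before the next click attempt.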
def get_last_page(wd):
    # Give the table a moment to refresh after the filters change.
    time.sleep(5)
    # The last numbered pager link holds the highest page number.
    xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding"]'
    em = wd.find_elements(By.XPATH, xpath)
    return em[-1].text
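# Flow of scraping(): filter by product / variant / version / architecture,
# bump the page size to 100, then walk the pager and dump each page's table.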
def scraping():
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    # options.add_argument('--window-size=1280,1280')
    wd = webdriver.Chrome(options=options)
    wd.implicitly_wait(20)
    wd.get('https://access.redhat.com/errata')
    # Select the product.
    # The dropdown button has to be clicked before an option can be chosen.
    click_by_xpath(wd, '//div[@class="col-sm-3"]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux"]')
    # Select the OS variant.
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_variant")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux Server"]')
    # Select the version.
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_version")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="6"]')
    # Select the architecture.
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_architecture")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="x86_64"]')
    # Show the maximum number of rows per page.
    select_by_xpath(wd, '//select[@ng-model="pageSize"]', '100')
    # Get the number of the last page.
    last_page = get_last_page(wd)
    # Walk every page and dump the table contents.
    for i in range(1, int(last_page) + 1):
        xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding" and text()="' + str(i) + '"]'
        click_by_xpath_with_scroll(wd, xpath)
        html = wd.page_source
        bs = BeautifulSoup(html, 'html.parser')
        table = bs.find_all('table', {'id': 'DataTables_Table_0'})[0]
        rows = table.find_all('tr')
        for row in rows:
            for cell in row.find_all('span', {'class': 'cell-content'}):
                print(cell.get_text())
        # Stay polite: pause between pages.
        time.sleep(1)
    wd.quit()
if __name__ == '__main__':
    scraping()
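Run it as-is and every cell of the errata table is printed page by page. Uncomment the --headless and --window-size options to run without a visible browser window.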