That was a struggle... Waiting is the key.
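The errata list is an Angular page (note the ng-* attributes in the XPaths), so elements go stale as the page re-renders; the script below pairs explicit WebDriverWait calls with a simple retry loop to cope.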
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import sys
import time

RETRY = 5
def select_by_xpath(wd, xpath, text):
    # Locate the <select> element by XPath.
    em = wd.find_element(By.XPATH, xpath)
    # Choose the option by its visible text.
    em = Select(em)
    # Select.options exposes the available choices:
    # for i in em.options:
    #     print(i.text)
    em.select_by_visible_text(text)
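# Usage example (the same call appears in scraping() below):
#   select_by_xpath(wd, '//select[@ng-model="pageSize"]', '100')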
def click_by_xpath(wd, xpath):
    # The element must be clickable before calling click(); otherwise this
    # error shows up intermittently:
    # selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
    # http://www.seleniumqref.com/api/python/conditions/Python_presence_of_element_located.html
    # In the end, an explicit wait plus a retry loop handles it.
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath failed: ' + xpath)
            sys.exit(1)
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath retried: ' + xpath)
            cnt += 1
            time.sleep(0.5)
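# Note: both waits are given the (By.XPATH, xpath) locator rather than a
# previously found element, so every polling attempt re-finds the element and
# a stale reference from an earlier render pass is discarded, not reused.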
def click_by_xpath_with_scroll(wd, xpath):
    # Same as click_by_xpath, but scroll down when the click fails;
    # the target may be below the fold.
    cnt = 1
    wait = WebDriverWait(wd, 30)
    while True:
        if cnt >= RETRY:
            print('Error: click_by_xpath_with_scroll failed: ' + xpath)
            sys.exit(1)
        try:
            em = wait.until(expected_conditions.visibility_of_element_located((By.XPATH, xpath)))
            em = wait.until(expected_conditions.element_to_be_clickable((By.XPATH, xpath)))
            em.click()
            break
        except Exception:
            print('Warn: click_by_xpath_with_scroll retried: ' + xpath)
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            cnt += 1
            time.sleep(0.5)
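# window.scrollTo(0, document.body.scrollHeight) jumps to the bottom of the
# page, bringing the pager links into view before the next click attempt.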
def get_last_page(wd):
    # Give the table a moment to refresh after the filters change.
    time.sleep(5)
    # The last numbered pager link holds the highest page number.
    xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding"]'
    em = wd.find_elements(By.XPATH, xpath)
    return em[-1].text
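# Flow of scraping(): filter by product / variant / version / architecture,
# bump the page size to 100, then walk the pager and dump each page's table.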
def scraping():
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    # options.add_argument('--window-size=1280,1280')
    wd = webdriver.Chrome(options=options)
    wd.implicitly_wait(20)
    wd.get('https://access.redhat.com/errata')
    # Select the product.
    # The dropdown button has to be clicked before an option can be chosen.
    click_by_xpath(wd, '//div[@class="col-sm-3"]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux"]')
    # Select the OS variant.
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_variant")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="Red Hat Enterprise Linux Server"]')
    # Select the version.
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_product_version")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="6"]')
    # Select the architecture.
    click_by_xpath(wd, '//div[@class="col-sm-3 chosen-wrapper" and contains(@ng-hide, "portal_architecture")]/div[@class="dropdown more-wrapper"]/button[@id="dropdown-other-filters-portal-product"]')
    click_by_xpath(wd, '//li[text()="x86_64"]')
    # Show the maximum number of rows per page.
    select_by_xpath(wd, '//select[@ng-model="pageSize"]', '100')
    # Get the number of the last page.
    last_page = get_last_page(wd)
    # Walk every page and dump the table contents.
    for i in range(1, int(last_page) + 1):
        xpath = '//a[@ng-click="setCurrent(pageNumber)" and @class="ng-binding" and text()="' + str(i) + '"]'
        click_by_xpath_with_scroll(wd, xpath)
        html = wd.page_source
        bs = BeautifulSoup(html, 'html.parser')
        table = bs.find_all('table', {'id': 'DataTables_Table_0'})[0]
        rows = table.find_all('tr')
        for row in rows:
            for cell in row.find_all('span', {'class': 'cell-content'}):
                print(cell.get_text())
        # Stay polite: pause between pages.
        time.sleep(1)
    wd.quit()
if __name__ == '__main__':
    scraping()
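Run it as-is and every cell of the errata table is printed page by page. Uncomment the --headless and --window-size options to run without a visible browser window.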