pikesaku’s blog

個人的な勉強メモです。記載内容について一切の責任は持ちません。

Jubatusで外れ値検知機能(jubaanomaly)の検証

サンプルプログラム

やってること

ユーザー名と接続元IPアドレスの情報で構成されるテストデータを作成
※接続元IPアドレスはたまに仮想クラッカーのIPアドレスを利用

jubaanomalyの起動・停止

仮想クラッカーのIPの時だけ判定結果を出力

引数により学習動作が異なる。

"string1"の場合

"ユーザー名:IPアドレス"の1つの文字列データにして学習させる。

"string2"の場合

"ユーザー名"と"IPアドレス"の2つの文字列データにして学習させる。

"ip_to_num"の場合

"ユーザー名"を1つの文字列データとし、"IPアドレス"を1つの数値データにして学習させる。

コンフィグファイル(anomaly.json)

num_rulesのtypeには、以下の理由によりstrを指定する。

参考URLに以下記載あり。

"num" 与えられた数値をそのまま重みに利用する。
"str" 与えられた数値を文字列として扱う。これは、例えばIDなど、数値自体の大きさに意味のないデータに対して利用する。重みは1とする。

ignore_kth_same_pointを有効にする。有効にしないとscoreがinfになってしまう為。
参考URLに以下記載あり。

登録できる重複データの件数を nearest_neighbor_num - 1 件に制限することにより、スコアが inf になることを防ぐ。 このパラメタは省略可能であり、デフォルト値は false (無効) である。 (Boolean)

{
 "method" : "lof",
 "parameter" : {
  "nearest_neighbor_num" : 10,
  "reverse_nearest_neighbor_num" : 30,
  "method" : "euclid_lsh",
  "ignore_kth_same_point" : true,
  "parameter" : {
   "hash_num" : 8,
   "table_num" : 16,
   "probe_num" : 64,
   "bin_width" : 10,
   "seed" : 1234
  }
 },

 "converter" : {
  "string_filter_types": {},
  "string_filter_rules": [],
  "num_filter_types": {},
  "num_filter_rules": [],
  "string_types": {},
  "string_rules": [{"key":"*", "type":"str", "global_weight" : "bin", "sample_weight" : "bin"}],
  "num_types": {},
  "num_rules": [{"key" : "*", "type" : "str"}]
 }
}

実行プログラム(anomaly_test.py)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import signal
import sys
import os
import json
from jubatus.anomaly import client
from jubatus.common import Datum
import random
import time
import ipaddr


# Learning modes accepted as the second command-line argument.
TNAMES = [
    "string1",
    "string2",
    "ip_to_num",
]

# Number of synthetic user names to generate.
NUM_OF_USER = 1000

# Number of [user, ip] records generated for one run.
NUM_OF_DATA = 10000

# make_ip() draws an integer in 1..HUSEI and uses a cracker IP only when the
# draw equals HUSEI, i.e. roughly one record in HUSEI comes from a cracker.
HUSEI = 1000


# IP addresses of the servers that made the most unauthorized login attempts
# against the author's own server.
CRACKERS = [
    "183.60.122.126",
    "188.140.127.155",
    "58.186.158.62",
    "91.224.160.184",
    "79.169.110.246",
    "46.186.241.89",
]


def make_users():
    """Return the synthetic user names "user0", "user1", ... as a list.

    The list holds NUM_OF_USER entries, numbered from 0.
    """
    return ["user" + str(number) for number in range(NUM_OF_USER)]


def make_ip():
    """Return one source IP address for a test record.

    A draw in 1..HUSEI decides the kind of address: only when the draw equals
    HUSEI is a randomly chosen entry of CRACKERS returned; every other draw
    yields a random "10.0.0.x" address with x in 1..254.
    """
    draw = random.randint(1, HUSEI)
    if draw != HUSEI:
        # Common case: an ordinary internal address.
        return "10.0.0." + str(random.randint(1, 254))

    # Rare case: pick one of the known cracker addresses.
    pick = random.randint(0, len(CRACKERS) - 1)
    return CRACKERS[pick]


def make_data(users):
    """Build the test data set: NUM_OF_DATA records of [user name, source IP].

    Args:
        users: sequence of user names; it is indexed with a random position
            in 0..NUM_OF_USER - 1.

    Returns:
        A list of two-element lists [user, ip], where ip comes from make_ip().
    """
    # The user index is drawn before make_ip() is called for each record
    # (list elements are evaluated left to right).
    return [
        [users[random.randint(0, NUM_OF_USER - 1)], make_ip()]
        for _ in range(NUM_OF_DATA)
    ]


def do_exit(sig, stack):
    """Signal handler: announce the interruption and exit with status 0.

    Args:
        sig: signal number passed by the signal module (unused).
        stack: current stack frame passed by the signal module (unused).
    """
    for message in ('You pressed Ctrl+C.', 'Stop running the job.'):
        print(message)
    sys.exit(0)


def ip2int(ip):
    """Convert a dotted-quad IPv4 string into its integer value.

    Args:
        ip: address string such as "10.0.0.1".

    Returns:
        The integer a * 2**24 + b * 2**16 + c * 2**8 + d for "a.b.c.d".

    Raises:
        ValueError: if ip does not consist of exactly four integer fields
            separated by dots.
    """
    # Build a real list instead of indexing the result of map(): on Python 3
    # map() returns an iterator, which cannot be subscripted.
    first, second, third, fourth = [int(field) for field in ip.split('.')]
    return (16777216 * first) + (65536 * second) + (256 * third) + fourth


def exec_test(tname, data):
    """Feed every record to the jubaanomaly server and report the results.

    Each record is converted to a Datum according to tname and added through
    the anomaly client; the value returned by add() is printed only for
    records whose IP is one of CRACKERS. A progress line is printed after
    every NUM_OF_DATA / 10 records.

    Args:
        tname: learning mode, one of "string1", "string2" or "ip_to_num".
            It is also passed to the client as its third constructor argument.
        data: iterable of [user, ip] records.
    """
    proc_per = 10
    # Integer division keeps the modulo check below an integer operation on
    # both Python 2 and Python 3.
    step = NUM_OF_DATA // proc_per
    cur = 0
    stime = time.time()

    # The constructor arguments never change, so build the client once and
    # reuse it instead of creating a new client object for every record.
    anom = client.Anomaly("127.0.0.1", 9199, tname)

    for cnt, ent in enumerate(data, 1):
        datum = Datum()

        if tname == "string1":
            # One combined "user:ip" string feature.
            datum.add_string("string", ent[0] + ":" + str(ent[1]))
        elif tname == "string2":
            # Two separate string features.
            datum.add_string("user", ent[0])
            datum.add_string("src_ip", ent[1])
        elif tname == "ip_to_num":
            # User as a string feature, IP as a numeric feature.
            datum.add_string("user", ent[0])
            datum.add_number("ip", ip2int(ent[1]))

        ret = anom.add(datum)

        if ent[1] in CRACKERS:
            # Print one tuple so the output has the same form whether print
            # is a statement (Python 2) or a function (Python 3).
            print((ret, ent))

        if not cnt % step:
            cur += proc_per
            etime = int(time.time() - stime)
            print("===>" + str(cur) + "% completed (elapse: " + str(etime) + " sec)")


def op_srv(OP):
    """Start or stop the jubaanomaly server through the shell.

    Args:
        OP: list whose first element is the operation. ["start", config]
            launches "jubaanomaly -f config" in the background; any other
            first element (e.g. ["stop"]) runs "pkill jubaanomaly".

    Exits the program with status 1 when the shell command returns a
    non-zero status.
    """
    # shlex.quote exists on Python 3; Python 2 offers the same function as
    # pipes.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if OP[0] == "start":
        # Quote the path so spaces or shell metacharacters in it cannot
        # break or alter the command line.
        # NOTE: the command is backgrounded with "&", so a zero status does
        # not show that jubaanomaly itself started successfully.
        com = "jubaanomaly -f " + quote(OP[1]) + " > /dev/null 2>&1 &"
    else:
        com = "pkill jubaanomaly > /dev/null 2>&1"
    if os.system(com):
        print("Error: jubaanomaly " + OP[0] + " failed")
        # sys.exit rather than the interactive-only exit() helper.
        sys.exit(1)


def get_args():
    """Validate the command line and return (config, tname).

    Expects exactly two arguments: the path of an existing config file and a
    test name listed in TNAMES. On any violation an error message is printed
    and the program exits with status 1.

    Returns:
        A tuple (config, tname) taken from sys.argv[1] and sys.argv[2].
    """
    if len(sys.argv) != 3:
        print("Error: Invalid args")
        # sys.exit instead of exit(): the latter is a helper installed by the
        # site module for interactive use and is not meant for programs.
        sys.exit(1)

    config = sys.argv[1]
    tname = sys.argv[2]

    if not os.path.isfile(config):
        print("Error: " + config + " does not exist")
        sys.exit(1)
    if tname not in TNAMES:
        print("Error: " + tname + " is invalid test name")
        sys.exit(1)
    return config, tname


def main():
    """Run one anomaly-detection test from start to finish.

    Installs the Ctrl+C handler, validates the arguments, generates the users
    and the test data, starts jubaanomaly, feeds it the data and stops it
    again.
    """
    signal.signal(signal.SIGINT, do_exit)
    config, tname = get_args()

    users = make_users()
    print("make users complete")
    data = make_data(users)
    print("make data complete")

    op_srv(["start", config])
    try:
        # Fixed wait between launching the server and sending data.
        time.sleep(5)
        print("Test Started")
        exec_test(tname, data)
    finally:
        # Always stop the backgrounded server, including when the test raises
        # or Ctrl+C makes do_exit() raise SystemExit; otherwise jubaanomaly
        # would keep running after this script ends.
        op_srv(["stop"])
    print("Test Finished")

if __name__ == '__main__':
    main()

実行方法&結果

# ./anomaly_test.py ./anomaly.json "string1"
make users complete
make data complete
Test Started
(id_with_score{id: 34, score: 1.00283694267}, ['user0', '58.186.158.62'])
(id_with_score{id: 223, score: 1.00143611431}, ['user80', '183.60.122.126'])
===>10% completed (elapse: 0 sec)
(id_with_score{id: 290, score: 0.999702095985}, ['user61', '91.224.160.184'])
(id_with_score{id: 294, score: 0.999072134495}, ['user13', '79.169.110.246'])
(id_with_score{id: 343, score: 0.99412637949}, ['user50', '58.186.158.62'])
(id_with_score{id: 405, score: 1.00214076042}, ['user12', '58.186.158.62'])
(id_with_score{id: 472, score: 1.0037945509}, ['user90', '183.60.122.126'])
(id_with_score{id: 486, score: 1.00199890137}, ['user19', '91.224.160.184'])
===>20% completed (elapse: 2 sec)
(id_with_score{id: 570, score: 1.00471949577}, ['user94', '58.186.158.62'])
(id_with_score{id: 673, score: 0.996484994888}, ['user90', '188.140.127.155'])
(id_with_score{id: 709, score: 1.00101566315}, ['user0', '183.60.122.126'])
===>30% completed (elapse: 5 sec)
(id_with_score{id: 852, score: 1.01610195637}, ['user35', '183.60.122.126'])
(id_with_score{id: 977, score: 0.999998986721}, ['user19', '46.186.241.89'])
===>40% completed (elapse: 10 sec)
(id_with_score{id: 1053, score: 1.01017296314}, ['user99', '58.186.158.62'])
(id_with_score{id: 1184, score: 0.998991131783}, ['user57', '46.186.241.89'])
(id_with_score{id: 1229, score: 0.997941493988}, ['user96', '183.60.122.126'])
===>50% completed (elapse: 15 sec)
===>60% completed (elapse: 22 sec)
===>70% completed (elapse: 29 sec)
(id_with_score{id: 1783, score: 1.00462818146}, ['user1', '46.186.241.89'])
===>80% completed (elapse: 37 sec)
(id_with_score{id: 2013, score: 1.00843286514}, ['user64', '79.169.110.246'])
(id_with_score{id: 2018, score: 1.00108575821}, ['user4', '58.186.158.62'])
(id_with_score{id: 2053, score: 1.00647413731}, ['user72', '188.140.127.155'])
(id_with_score{id: 2227, score: 1.00535583496}, ['user3', '91.224.160.184'])
(id_with_score{id: 2231, score: 1.0007673502}, ['user16', '46.186.241.89'])
===>90% completed (elapse: 46 sec)
(id_with_score{id: 2353, score: 0.995516657829}, ['user97', '58.186.158.62'])
(id_with_score{id: 2421, score: 0.9998447299}, ['user94', '188.140.127.155'])
(id_with_score{id: 2475, score: 0.992462217808}, ['user87', '183.60.122.126'])
===>100% completed (elapse: 57 sec)
Test Finished
# ./anomaly_test.py ./anomaly.json "string2"
make users complete
make data complete
Test Started
(id_with_score{id: 38, score: 1.00619769096}, ['user4', '91.224.160.184'])
(id_with_score{id: 131, score: 0.9985871315}, ['user85', '188.140.127.155'])
(id_with_score{id: 176, score: 1.06174123287}, ['user15', '91.224.160.184'])
===>10% completed (elapse: 0 sec)
(id_with_score{id: 308, score: 1.01850771904}, ['user79', '46.186.241.89'])
(id_with_score{id: 442, score: 1.02138268948}, ['user53', '58.186.158.62'])
===>20% completed (elapse: 2 sec)
(id_with_score{id: 728, score: 1.00389277935}, ['user87', '46.186.241.89'])
===>30% completed (elapse: 3 sec)
(id_with_score{id: 771, score: 0.993154287338}, ['user29', '183.60.122.126'])
(id_with_score{id: 822, score: 1.00076854229}, ['user96', '79.169.110.246'])
(id_with_score{id: 854, score: 1.01049315929}, ['user94', '183.60.122.126'])
(id_with_score{id: 880, score: 1.02746069431}, ['user20', '91.224.160.184'])
(id_with_score{id: 903, score: 1.03314602375}, ['user41', '183.60.122.126'])
(id_with_score{id: 910, score: 1.00415050983}, ['user67', '79.169.110.246'])
===>40% completed (elapse: 6 sec)
(id_with_score{id: 1006, score: 1.01191151142}, ['user84', '91.224.160.184'])
(id_with_score{id: 1175, score: 1.01555621624}, ['user93', '91.224.160.184'])
(id_with_score{id: 1243, score: 0.986846208572}, ['user96', '79.169.110.246'])
===>50% completed (elapse: 9 sec)
(id_with_score{id: 1290, score: 0.999491155148}, ['user84', '91.224.160.184'])
(id_with_score{id: 1363, score: 1.0839984417}, ['user95', '183.60.122.126'])
(id_with_score{id: 1435, score: 0.988385975361}, ['user71', '79.169.110.246'])
(id_with_score{id: 1451, score: 1.07448995113}, ['user50', '46.186.241.89'])
===>60% completed (elapse: 13 sec)
(id_with_score{id: 1506, score: 1.01763594151}, ['user22', '46.186.241.89'])
(id_with_score{id: 1659, score: 1.02214670181}, ['user27', '58.186.158.62'])
===>70% completed (elapse: 17 sec)
(id_with_score{id: 1919, score: 1.01828491688}, ['user27', '79.169.110.246'])
(id_with_score{id: 1944, score: 1.01462376118}, ['user30', '183.60.122.126'])
===>80% completed (elapse: 22 sec)
(id_with_score{id: 2030, score: 1.01424443722}, ['user30', '188.140.127.155'])
(id_with_score{id: 2203, score: 1.05265760422}, ['user54', '91.224.160.184'])
===>90% completed (elapse: 28 sec)
(id_with_score{id: 2276, score: 0.991128385067}, ['user79', '46.186.241.89'])
(id_with_score{id: 2305, score: 0.999580144882}, ['user87', '188.140.127.155'])
(id_with_score{id: 2308, score: 1.02546048164}, ['user23', '183.60.122.126'])
(id_with_score{id: 2403, score: 0.996041715145}, ['user55', '183.60.122.126'])
===>100% completed (elapse: 34 sec)
Test Finished
# ./anomaly_test.py ./anomaly.json "ip_to_num"
make users complete
make data complete
Test Started
(id_with_score{id: 11, score: 1.40382528305}, ['user79', '183.60.122.126'])
(id_with_score{id: 96, score: 1.65164899826}, ['user30', '79.169.110.246'])
===>10% completed (elapse: 0 sec)
(id_with_score{id: 302, score: 1.38489890099}, ['user77', '183.60.122.126'])
(id_with_score{id: 304, score: 1.52720057964}, ['user16', '183.60.122.126'])
(id_with_score{id: 334, score: 1.37992143631}, ['user58', '58.186.158.62'])
(id_with_score{id: 339, score: 1.53620314598}, ['user7', '58.186.158.62'])
(id_with_score{id: 375, score: 1.511734128}, ['user27', '91.224.160.184'])
===>20% completed (elapse: 2 sec)
(id_with_score{id: 524, score: 1.37163031101}, ['user22', '183.60.122.126'])
===>30% completed (elapse: 3 sec)
(id_with_score{id: 880, score: 1.40898621082}, ['user93', '58.186.158.62'])
(id_with_score{id: 891, score: 1.32244873047}, ['user49', '79.169.110.246'])
===>40% completed (elapse: 5 sec)
(id_with_score{id: 1217, score: 1.43259418011}, ['user20', '183.60.122.126'])
(id_with_score{id: 1242, score: 1.44561052322}, ['user98', '91.224.160.184'])
===>50% completed (elapse: 6 sec)
(id_with_score{id: 1300, score: 1.69272983074}, ['user14', '46.186.241.89'])
(id_with_score{id: 1382, score: 1.36963653564}, ['user78', '79.169.110.246'])
(id_with_score{id: 1426, score: 1.37384569645}, ['user77', '46.186.241.89'])
===>60% completed (elapse: 7 sec)
(id_with_score{id: 1547, score: 1.37469255924}, ['user32', '188.140.127.155'])
(id_with_score{id: 1746, score: 1.34071433544}, ['user80', '46.186.241.89'])
(id_with_score{id: 1749, score: 1.764798522}, ['user18', '91.224.160.184'])
===>70% completed (elapse: 7 sec)
(id_with_score{id: 1827, score: 1.44498622417}, ['user76', '183.60.122.126'])
(id_with_score{id: 1925, score: 1.32021319866}, ['user17', '58.186.158.62'])
===>80% completed (elapse: 8 sec)
(id_with_score{id: 2104, score: 1.12577271461}, ['user68', '188.140.127.155'])
(id_with_score{id: 2128, score: 1.35998618603}, ['user53', '58.186.158.62'])
(id_with_score{id: 2192, score: 1.13407361507}, ['user31', '79.169.110.246'])
(id_with_score{id: 2211, score: 1.47604465485}, ['user29', '188.140.127.155'])
===>90% completed (elapse: 9 sec)
(id_with_score{id: 2292, score: 1.1355766058}, ['user61', '91.224.160.184'])
(id_with_score{id: 2305, score: 1.38899683952}, ['user81', '58.186.158.62'])
(id_with_score{id: 2349, score: 1.21521854401}, ['user15', '58.186.158.62'])
(id_with_score{id: 2385, score: 1.15831208229}, ['user61', '183.60.122.126'])
(id_with_score{id: 2408, score: 1.55122351646}, ['user12', '46.186.241.89'])
(id_with_score{id: 2482, score: 1.24194133282}, ['user17', '46.186.241.89'])
===>100% completed (elapse: 9 sec)
Test Finished

分かったこと

テキストデータの違いは重みづけが弱い?
学習済みデータが多くなればなるほど、学習時間がかかる?