サンプルプログラム
やってること
ユーザー名と接続元IPアドレスの情報で構成されるテストデータを作成
※接続元IPアドレスはたまに仮想クラッカーのIPアドレスを利用
jubaanomalyの起動・停止
仮想クラッカーのIPの時だけ判定結果を出力
引数により学習動作が異なる。
"string1"の場合
"ユーザー名:IPアドレス"の1つの文字列データにして学習させる。
"string2"の場合
"ユーザー名"と"IPアドレス"の2つの文字列データにして学習させる。
"ip_to_num"の場合
"ユーザー名"を1つの文字列データとし、"IPアドレス"を1つの数値データにして学習させる。
コンフィグファイル(anomaly.json)
num_rulesのtypeには、以下の理由によりstrを指定する。
参考URLに以下記載あり。
"num" 与えられた数値をそのまま重みに利用する。
"log" 与えられた数値の自然対数を重みに利用する。但し、数値が1以下の場合は0とする。
"str" 与えられた数値を文字列として扱う。これは、例えばIDなど、数値自体の大きさに意味のないデータに対して利用する。重みは1とする。
ignore_kth_same_pointを有効にする。有効にしないとscoreがinfになってしまう為。
参考URLに以下記載あり。
登録できる重複データの件数を nearest_neighbor_num - 1 件に制限することにより、スコアが inf になることを防ぐ。 このパラメタは省略可能であり、デフォルト値は false (無効) である。 (Boolean)
{ "method" : "lof", "parameter" : { "nearest_neighbor_num" : 10, "reverse_nearest_neighbor_num" : 30, "method" : "euclid_lsh", "ignore_kth_same_point" : true, "parameter" : { "hash_num" : 8, "table_num" : 16, "probe_num" : 64, "bin_width" : 10, "seed" : 1234 } }, "converter" : { "string_filter_types": {}, "string_filter_rules": [], "num_filter_types": {}, "num_filter_rules": [], "string_types": {}, "string_rules": [{"key":"*", "type":"str", "global_weight" : "bin", "sample_weight" : "bin"}], "num_types": {}, "num_rules": [{"key" : "*", "type" : "str"}] } }
実行プログラム(anomaly_test.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Feed synthetic login records to a local jubaanomaly server.

Usage: anomaly_test.py CONFIG TEST_NAME

CONFIG is the path of the jubaanomaly configuration file and TEST_NAME is
one of TNAMES.  The script generates (user name, source IP) records, starts
jubaanomaly, adds every record to it, prints the returned result for the
records whose IP is listed in CRACKERS, and finally stops the server.
"""
import signal
import sys
import os
import json
import random
import time

from jubatus.anomaly import client
from jubatus.common import Datum
import ipaddr

# Test modes accepted as the second command-line argument.
TNAMES = ["string1", "string2", "ip_to_num"]
NUM_OF_USER = 1000
NUM_OF_DATA = 10000
# make_ip() picks a cracker IP when a draw from 1..HUSEI equals HUSEI,
# i.e. for one record in HUSEI on average.
HUSEI = 1000
# IP addresses of servers that made many unauthorized login attempts
# against the author's own server.
CRACKERS = ["183.60.122.126",
            "188.140.127.155",
            "58.186.158.62",
            "91.224.160.184",
            "79.169.110.246",
            "46.186.241.89"]


def make_users():
    """Return the list of user names "user0" .. "user<NUM_OF_USER - 1>"."""
    return ["user" + str(i) for i in range(NUM_OF_USER)]


def make_ip():
    """Return a random source IP address as a dotted-quad string.

    With probability 1/HUSEI the address is taken from CRACKERS; otherwise
    it is a random host in 10.0.0.1 - 10.0.0.254.
    """
    if random.randint(1, HUSEI) == HUSEI:
        return random.choice(CRACKERS)
    return "10.0.0." + str(random.randint(1, 254))


def make_data(users):
    """Return NUM_OF_DATA records of the form [user name, source IP].

    The user is drawn from the list that was passed in, so the function
    also works when that list does not have exactly NUM_OF_USER entries.
    """
    return [[random.choice(users), make_ip()] for _ in range(NUM_OF_DATA)]


def do_exit(sig, stack):
    """SIGINT handler: report the interruption and exit with status 0."""
    print('You pressed Ctrl+C.')
    print('Stop running the job.')
    sys.exit(0)


def ip2int(ip):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value."""
    # A real list is required: on Python 3 map() returns an iterator, which
    # cannot be indexed.
    o = [int(part) for part in ip.split('.')]
    return (16777216 * o[0]) + (65536 * o[1]) + (256 * o[2]) + o[3]


def exec_test(tname, data):
    """Add every record in data to the anomaly server.

    tname selects how a record is turned into a Datum:
      "string1"   - one string feature "user:ip"
      "string2"   - two string features, user and source IP
      "ip_to_num" - user as a string feature, IP as a numeric feature

    The value returned by the server is printed for records whose IP is in
    CRACKERS, and a progress line is printed after every 10% of the data.
    """
    proc_per = 10
    total = len(data)
    # Records per progress step; max() keeps the modulus non-zero when
    # there are fewer records than progress steps.
    step = max(1, total // proc_per)
    # One client is enough for the whole run; creating it inside the loop
    # would open a new connection for every record.
    anom = client.Anomaly("127.0.0.1", 9199, tname)
    stime = time.time()
    for cnt, ent in enumerate(data, 1):
        datum = Datum()
        if tname == "string1":
            datum.add_string("string", ent[0] + ":" + str(ent[1]))
        elif tname == "string2":
            datum.add_string("user", ent[0])
            datum.add_string("src_ip", ent[1])
        elif tname == "ip_to_num":
            datum.add_string("user", ent[0])
            datum.add_number("ip", ip2int(ent[1]))
        ret = anom.add(datum)
        if ent[1] in CRACKERS:
            # Explicit formatting gives the same "(result, record)" line on
            # Python 2 and Python 3.
            print("(%r, %r)" % (ret, ent))
        if cnt % step == 0:
            cur = cnt * 100 // total
            etime = int(time.time() - stime)
            print("===>" + str(cur) + "% completed (elapse: " +
                  str(etime) + " sec)")


def op_srv(OP):
    """Start or stop jubaanomaly through the shell.

    OP is ["start", config_path] to launch the server in the background, or
    any other list (e.g. ["stop"]) to kill it with pkill.  Prints an error
    and exits with status 1 when the shell command reports failure.
    """
    if OP[0] == "start":
        # The space before ">" stops a config path consisting only of
        # digits from being parsed as a file descriptor number.
        # NOTE: the command is backgrounded with "&", so the status checked
        # below presumably does not reflect a server startup failure.
        com = "jubaanomaly -f " + OP[1] + " > /dev/null 2>&1 &"
    else:
        com = "pkill jubaanomaly > /dev/null 2>&1"
    if os.system(com):
        print("Error: jubaanomaly " + OP[0] + " failed")
        sys.exit(1)


def get_args():
    """Validate sys.argv and return (config path, test name).

    Prints an error and exits with status 1 when the argument count is
    wrong, the config file does not exist, or the test name is not in
    TNAMES.
    """
    if len(sys.argv) != 3:
        print("Error: Invalid args")
        sys.exit(1)
    config = sys.argv[1]
    tname = sys.argv[2]
    if not os.path.isfile(config):
        print("Error: " + config + " does not exist")
        sys.exit(1)
    if tname not in TNAMES:
        print("Error: " + tname + " is invalid test name")
        sys.exit(1)
    return config, tname


def main():
    """Generate the test data, run the test against a fresh server."""
    signal.signal(signal.SIGINT, do_exit)
    config, tname = get_args()
    users = make_users()
    print("make users complete")
    data = make_data(users)
    print("make data complete")
    op_srv(["start", config])
    try:
        # Fixed wait for the server to come up before sending data.
        time.sleep(5)
        print("Test Started")
        exec_test(tname, data)
    except BaseException:
        # Do not leave the background server running after Ctrl+C or an
        # error.  A failing stop must not hide the original exception.
        try:
            op_srv(["stop"])
        except SystemExit:
            pass
        raise
    op_srv(["stop"])
    print("Test Finished")


if __name__ == '__main__':
    main()
実行方法&結果
./anomaly_test.py ./anomaly.json "string1" make users complete make data complete Test Started (id_with_score{id: 34, score: 1.00283694267}, ['user0', '58.186.158.62']) (id_with_score{id: 223, score: 1.00143611431}, ['user80', '183.60.122.126']) ===>10% completed (elapse: 0 sec) (id_with_score{id: 290, score: 0.999702095985}, ['user61', '91.224.160.184']) (id_with_score{id: 294, score: 0.999072134495}, ['user13', '79.169.110.246']) (id_with_score{id: 343, score: 0.99412637949}, ['user50', '58.186.158.62']) (id_with_score{id: 405, score: 1.00214076042}, ['user12', '58.186.158.62']) (id_with_score{id: 472, score: 1.0037945509}, ['user90', '183.60.122.126']) (id_with_score{id: 486, score: 1.00199890137}, ['user19', '91.224.160.184']) ===>20% completed (elapse: 2 sec) (id_with_score{id: 570, score: 1.00471949577}, ['user94', '58.186.158.62']) (id_with_score{id: 673, score: 0.996484994888}, ['user90', '188.140.127.155']) (id_with_score{id: 709, score: 1.00101566315}, ['user0', '183.60.122.126']) ===>30% completed (elapse: 5 sec) (id_with_score{id: 852, score: 1.01610195637}, ['user35', '183.60.122.126']) (id_with_score{id: 977, score: 0.999998986721}, ['user19', '46.186.241.89']) ===>40% completed (elapse: 10 sec) (id_with_score{id: 1053, score: 1.01017296314}, ['user99', '58.186.158.62']) (id_with_score{id: 1184, score: 0.998991131783}, ['user57', '46.186.241.89']) (id_with_score{id: 1229, score: 0.997941493988}, ['user96', '183.60.122.126']) ===>50% completed (elapse: 15 sec) ===>60% completed (elapse: 22 sec) ===>70% completed (elapse: 29 sec) (id_with_score{id: 1783, score: 1.00462818146}, ['user1', '46.186.241.89']) ===>80% completed (elapse: 37 sec) (id_with_score{id: 2013, score: 1.00843286514}, ['user64', '79.169.110.246']) (id_with_score{id: 2018, score: 1.00108575821}, ['user4', '58.186.158.62']) (id_with_score{id: 2053, score: 1.00647413731}, ['user72', '188.140.127.155']) (id_with_score{id: 2227, score: 1.00535583496}, ['user3', '91.224.160.184']) 
(id_with_score{id: 2231, score: 1.0007673502}, ['user16', '46.186.241.89']) ===>90% completed (elapse: 46 sec) (id_with_score{id: 2353, score: 0.995516657829}, ['user97', '58.186.158.62']) (id_with_score{id: 2421, score: 0.9998447299}, ['user94', '188.140.127.155']) (id_with_score{id: 2475, score: 0.992462217808}, ['user87', '183.60.122.126']) ===>100% completed (elapse: 57 sec) Test Finished
# ./anomaly_test.py ./anomaly.json "string2" make users complete make data complete Test Started (id_with_score{id: 38, score: 1.00619769096}, ['user4', '91.224.160.184']) (id_with_score{id: 131, score: 0.9985871315}, ['user85', '188.140.127.155']) (id_with_score{id: 176, score: 1.06174123287}, ['user15', '91.224.160.184']) ===>10% completed (elapse: 0 sec) (id_with_score{id: 308, score: 1.01850771904}, ['user79', '46.186.241.89']) (id_with_score{id: 442, score: 1.02138268948}, ['user53', '58.186.158.62']) ===>20% completed (elapse: 2 sec) (id_with_score{id: 728, score: 1.00389277935}, ['user87', '46.186.241.89']) ===>30% completed (elapse: 3 sec) (id_with_score{id: 771, score: 0.993154287338}, ['user29', '183.60.122.126']) (id_with_score{id: 822, score: 1.00076854229}, ['user96', '79.169.110.246']) (id_with_score{id: 854, score: 1.01049315929}, ['user94', '183.60.122.126']) (id_with_score{id: 880, score: 1.02746069431}, ['user20', '91.224.160.184']) (id_with_score{id: 903, score: 1.03314602375}, ['user41', '183.60.122.126']) (id_with_score{id: 910, score: 1.00415050983}, ['user67', '79.169.110.246']) ===>40% completed (elapse: 6 sec) (id_with_score{id: 1006, score: 1.01191151142}, ['user84', '91.224.160.184']) (id_with_score{id: 1175, score: 1.01555621624}, ['user93', '91.224.160.184']) (id_with_score{id: 1243, score: 0.986846208572}, ['user96', '79.169.110.246']) ===>50% completed (elapse: 9 sec) (id_with_score{id: 1290, score: 0.999491155148}, ['user84', '91.224.160.184']) (id_with_score{id: 1363, score: 1.0839984417}, ['user95', '183.60.122.126']) (id_with_score{id: 1435, score: 0.988385975361}, ['user71', '79.169.110.246']) (id_with_score{id: 1451, score: 1.07448995113}, ['user50', '46.186.241.89']) ===>60% completed (elapse: 13 sec) (id_with_score{id: 1506, score: 1.01763594151}, ['user22', '46.186.241.89']) (id_with_score{id: 1659, score: 1.02214670181}, ['user27', '58.186.158.62']) ===>70% completed (elapse: 17 sec) (id_with_score{id: 1919, score: 
1.01828491688}, ['user27', '79.169.110.246']) (id_with_score{id: 1944, score: 1.01462376118}, ['user30', '183.60.122.126']) ===>80% completed (elapse: 22 sec) (id_with_score{id: 2030, score: 1.01424443722}, ['user30', '188.140.127.155']) (id_with_score{id: 2203, score: 1.05265760422}, ['user54', '91.224.160.184']) ===>90% completed (elapse: 28 sec) (id_with_score{id: 2276, score: 0.991128385067}, ['user79', '46.186.241.89']) (id_with_score{id: 2305, score: 0.999580144882}, ['user87', '188.140.127.155']) (id_with_score{id: 2308, score: 1.02546048164}, ['user23', '183.60.122.126']) (id_with_score{id: 2403, score: 0.996041715145}, ['user55', '183.60.122.126']) ===>100% completed (elapse: 34 sec) Test Finished
# ./anomaly_test.py ./anomaly.json "ip_to_num" make users complete make data complete Test Started (id_with_score{id: 11, score: 1.40382528305}, ['user79', '183.60.122.126']) (id_with_score{id: 96, score: 1.65164899826}, ['user30', '79.169.110.246']) ===>10% completed (elapse: 0 sec) (id_with_score{id: 302, score: 1.38489890099}, ['user77', '183.60.122.126']) (id_with_score{id: 304, score: 1.52720057964}, ['user16', '183.60.122.126']) (id_with_score{id: 334, score: 1.37992143631}, ['user58', '58.186.158.62']) (id_with_score{id: 339, score: 1.53620314598}, ['user7', '58.186.158.62']) (id_with_score{id: 375, score: 1.511734128}, ['user27', '91.224.160.184']) ===>20% completed (elapse: 2 sec) (id_with_score{id: 524, score: 1.37163031101}, ['user22', '183.60.122.126']) ===>30% completed (elapse: 3 sec) (id_with_score{id: 880, score: 1.40898621082}, ['user93', '58.186.158.62']) (id_with_score{id: 891, score: 1.32244873047}, ['user49', '79.169.110.246']) ===>40% completed (elapse: 5 sec) (id_with_score{id: 1217, score: 1.43259418011}, ['user20', '183.60.122.126']) (id_with_score{id: 1242, score: 1.44561052322}, ['user98', '91.224.160.184']) ===>50% completed (elapse: 6 sec) (id_with_score{id: 1300, score: 1.69272983074}, ['user14', '46.186.241.89']) (id_with_score{id: 1382, score: 1.36963653564}, ['user78', '79.169.110.246']) (id_with_score{id: 1426, score: 1.37384569645}, ['user77', '46.186.241.89']) ===>60% completed (elapse: 7 sec) (id_with_score{id: 1547, score: 1.37469255924}, ['user32', '188.140.127.155']) (id_with_score{id: 1746, score: 1.34071433544}, ['user80', '46.186.241.89']) (id_with_score{id: 1749, score: 1.764798522}, ['user18', '91.224.160.184']) ===>70% completed (elapse: 7 sec) (id_with_score{id: 1827, score: 1.44498622417}, ['user76', '183.60.122.126']) (id_with_score{id: 1925, score: 1.32021319866}, ['user17', '58.186.158.62']) ===>80% completed (elapse: 8 sec) (id_with_score{id: 2104, score: 1.12577271461}, ['user68', '188.140.127.155']) 
(id_with_score{id: 2128, score: 1.35998618603}, ['user53', '58.186.158.62']) (id_with_score{id: 2192, score: 1.13407361507}, ['user31', '79.169.110.246']) (id_with_score{id: 2211, score: 1.47604465485}, ['user29', '188.140.127.155']) ===>90% completed (elapse: 9 sec) (id_with_score{id: 2292, score: 1.1355766058}, ['user61', '91.224.160.184']) (id_with_score{id: 2305, score: 1.38899683952}, ['user81', '58.186.158.62']) (id_with_score{id: 2349, score: 1.21521854401}, ['user15', '58.186.158.62']) (id_with_score{id: 2385, score: 1.15831208229}, ['user61', '183.60.122.126']) (id_with_score{id: 2408, score: 1.55122351646}, ['user12', '46.186.241.89']) (id_with_score{id: 2482, score: 1.24194133282}, ['user17', '46.186.241.89']) ===>100% completed (elapse: 9 sec) Test Finished
分かったこと
テキストデータの違いは重みづけが弱い?
学習済みデータが多くなればなるほど、学習時間がかかる?