#
# main 6_plsa.R
#
# Make : 2010.12.08
# Modify : 2010.12.16
# Author : Yoshimura. T
#
main = function (bin, K, Itr, Rep, every, s1, s2) {
init = function (x, N, M, K, s1, s2) {
p_z = rep (1 / K, K)
p_dz = matrix (0, K, N)
p_wz = matrix (0, K, M)
p_zdw = list ()
for (z in 1:K) {
p_zdw[[z]] = matrix (0, N, M)
rownames (p_zdw[[z]]) = rownames (x)
colnames (p_zdw[[z]]) = colnames (x)
p_dz[z, ] = rand_beta (N, s1, s2)
p_wz[z, ] = rand_beta (M, s1, s2)
}
p = list (p_z, p_dz, p_wz, p_zdw)
names (p) = c ("z", "dz", "wz", "zdw")
return (p)
}
init_mini = function (N, M, K) {
z = rep (0, K)
dz = matrix (0, K, N)
wz = matrix (0, K, M)
x = list (z, dz, wz)
names (x) = c ("z", "dz", "wz")
return (x)
}
rand_beta = function (num, s1, s2) {
# s1 = s2 = 1.0 の時は一様分布となる
x = rbeta (num, s1, s2)
return (x / sum (x))
}
judge = function (x) {
# 尤度が減少すれば、反復を止める
if (x <= 0.0) {
return (2)
}
# 尤度の変化が少なくなれば、温度を上げる
if (x < 100.0) {
return (1)
}
# 尤度の変化が大きければ、温度は変えない
return (0)
}
vsm = function (x) {
size = dim (x)[2]
# ベクトルの大きさを求める
norm = sqrt (apply (x ** 2, 2, sum))
# 類似度行列の初期化
similar = matrix (0, nrow = size, ncol = size)
for (i in 1:size) {
for (j in 1:size) {
# ベクトルの内積を求める
inner = sum (x[, i] * x[, j])
# 類似度 = cosθ を求める
similar[i, j] = inner / (norm[i] * norm[j])
}
}
return (similar)
}
printout = function (x, L, out, th, rank = 10) {
cat (file = out, append = FALSE)
for(i in 1:dim (x)[1]) {
cat (" ########", i, "######## \n", file = out, append = TRUE)
o = order (x[, i], decreasing = TRUE)
o = replace (o, c (rank:dim (x)[1]), c (0))
x[o[th[i]:rank], i] = 0
s = sprintf ("%-40s %.6f\n", colnames (x)[o], x[o, i])
cat (s, "\n", file = out, append = TRUE, sep = "")
}
s = sprintf ("対数尤度 = %.2f", L)
cat (s, file = out, append = TRUE)
}
threthold = function (x) {
size = dim (x)[2]
th = NULL
for (j in 1:size) {
mdif = 0.1
max = dim (x)[2]
for (i in 2:(size - 1)) {
dif = x[i, j] - x[i + 1, j]
if (dif > mdif) {
mdif = dif
max = i + 1
}
}
th = append (th, max)
}
return (th)
}
# 引数のエラー処理
if (K < 2 || Itr < 1 || Rep < 1 || s1 < 0 || s2 < 0) {
cat ("Please input larger numbers.\n")
return (-1)
}
# 作業用ディレクトリの設定
setwd ("C:/cygwin/home/吉村 建慶/Research/main/result")
# 入出力指定
input = "3_matrix/matrix.csv"
s = sprintf ("%d %d %.1f %.1f", K, Itr, s1, s2)
outdir = paste ("6_plsa", s, sep = "/")
data = t (read.csv (input, row.name = 1))
N = dim (data)[1]
M = dim (data)[2]
eta = 0.95
for (j in 1:Rep) {
# 初期化
B = 1
P = init (data, N, M, K, s1, s2)
for (i in 1:Itr) {
numer = init_mini (N, M, K)
# E ステップ
for (d in 1:N) {
for (w in 1:M) {
denom = 0
for (z in 1:K) {
denom = denom + P$z[z] * (P$dz[z, d] * P$wz[z, w]) ^ B
}
for (z in 1:K) {
P$zdw[[z]][d, w] = P$z[z] * (P$dz[z, d] * P$wz[z, w]) ^ B / denom
# 用語が出現していないなら計算の意味が無いので飛ばす
if (!data[d, w]) { next }
temp = data[d, w] * P$zdw[[z]][d, w]
numer$z[z] = numer$z[z] + temp
numer$dz[z, d] = numer$dz[z, d] + temp
numer$wz[z, w] = numer$wz[z, w] + temp
}
}
}
# M ステップ
P$z = numer$z / sum (numer$z)
P$dz = numer$dz / apply (numer$dz, 1, sum)
P$wz = numer$wz / apply (numer$wz, 1, sum)
# 尤度の計算
L = 0
for (d in 1:N) {
for (w in 1:M) {
if (!data[d, w]) { next }
Pdw = 0
for (z in 1:K) {
Pdw = Pdw + P$z[z] * P$dz[z, d] * P$wz[z, w]
}
L = L + log (Pdw) * data[d, w]
}
}
# 結果の出力
if (every > 0 || i == Itr) {
similar = vsm (P$dz)
rownames (similar) = rownames (data)
colnames (similar) = rownames (data)
if (bin) {
temp = apply (similar, 2, sort, decreasing = TRUE)
th = threthold (temp)
} else {
th = rep (dim (similar)[2], dim (similar)[2])
}
s = sprintf ("%s/Pdz_vsm_%02d-%02d.txt", outdir, j, i)
printout (similar, L, s, th)
}
if (i > 1) {
dif = L - oldL
# 状況表示
s = sprintf ("%02d-%02d : B = %.2f dL = %.6f\n", j, i, B, dif)
cat (s)
# 収束判定
temp = judge (dif)
if (temp == 2) { break }
else if (temp == 1) { B = B * eta }
} else {
s = sprintf ("%02d-01 : B = 1.00 dL = Non\n", j)
cat (s)
}
# 対数尤度の保存
oldL = L
}
}
return (P)
}
# パラメータ(文字型)を整数型または実数型に変換してから関数に与える
args = commandArgs ()
args.int = as.integer (args[6:10])
args.num = as.numeric (args[11:12])
system.time (main (
args.int[1], args.int[2], args.int[3], args.int[4], args.int[5],
args.num[1], args.num[2]
))
### END ###