# ===== phsic/__init__.py =====
__version__ = '0.1.0'


# ===== phsic/app.py =====
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import sys
from pathlib import Path

import dill
import numpy as np

import phsic
import phsic.config
import phsic.encoder
import phsic.kernel
import phsic.phsic

"""
# naming rule

## raw text
- {data_dir}/{data_basename}
- e.g., data/nmt/train.en

## feature vector
- {out_dir}/feat/{data_basename}.{encoder_config}.vec
- e.g., work/nmt/feat/train.en.NormalizedUSE

## model
- {out_dir}/model/{data_basename_X}.{kernel_config_X}-{encoder_config_X}.{data_basename_Y}.{kernel_config_Y}-{encoder_config_Y}
- e.g., work/nmt/model/train.en.Linear-NormalizedUSE.train.ja.Linear-NormalizedBov-FasttextJa(limit100000)

## score
- {out_dir}/score/{data_basename_X}.{kernel_config_X}-{encoder_config_X}.{data_basename_Y}.{kernel_config_Y}-{encoder_config_Y}[.{X_test}.{Y_test}].phsic
- e.g., work/nmt/model/train.en.Linear-NormalizedUSE.train.ja.Linear-NormalizedBov-FasttextJa(limit100000).test.en.test.ja.phsic
"""


def prepare_feature_vecs(path_to_data, path_to_feat_vec, encoder,
                         encoder_config, limit_words, emb_file):
    # load cached feature vectors if they exist
    if os.path.isfile(path_to_feat_vec):
        print('Loading feature vectors: {}'.format(path_to_feat_vec),
              file=sys.stderr)
        X = dill.load(open(path_to_feat_vec, 'rb'))
    # otherwise encode the raw text and cache the result
    else:
        # `encoder` is a one-element list so that the caller can share a
        # single lazily-built encoder between the X and Y sides
        if encoder[0] is None:
            encoder[0] = phsic.encoder.encoder(encoder_config,
                                               limit_words=limit_words,
                                               emb_file=emb_file)
        X = encoder[0].encode_batch_from_file(path_to_data)
        print('Dumping feature vectors: {}'.format(path_to_feat_vec),
              file=sys.stderr)
        p = Path(path_to_feat_vec)
        p.parent.mkdir(parents=True, exist_ok=True)
        with open(path_to_feat_vec, 'wb') as f:
            dill.dump(X, f)
    return X


def run(args):
    # normalize kernel/encoder configs for PHSIC,
    # e.g., (Cos, SumBov FasttextEn) -> (Linear, NormalizedSumBov FasttextEn)
    kernel_config1, encoder_config1 = phsic.config.convert_config_for_phsic(
        args.kernel1, args.encoder1)
    kernel_config2, encoder_config2 = phsic.config.convert_config_for_phsic(
        args.kernel2, args.encoder2)
    print('kernel_config1: {}'.format(kernel_config1), file=sys.stderr)
    print('encoder_config1: {}'.format(encoder_config1), file=sys.stderr)
    print('kernel_config2: {}'.format(kernel_config2), file=sys.stderr)
    print('encoder_config2: {}'.format(encoder_config2), file=sys.stderr)

    X_train_basename = os.path.basename(args.file_X)
    Y_train_basename = os.path.basename(args.file_Y)
    X_train_feat_path = os.path.join(
        args.out_dir, 'feat',
        '{data_basename}.{encoder_config}.vec'.format(
            data_basename=X_train_basename,
            encoder_config='-'.join(encoder_config1)))
    Y_train_feat_path = os.path.join(
        args.out_dir, 'feat',
        '{data_basename}.{encoder_config}.vec'.format(
            data_basename=Y_train_basename,
            encoder_config='-'.join(encoder_config2)))
    if args.X_test and args.Y_test:
        X_test_basename = os.path.basename(args.X_test)
        Y_test_basename = os.path.basename(args.Y_test)
        X_test_feat_path = os.path.join(
            args.out_dir, 'feat',
            '{data_basename}.{encoder_config}.vec'.format(
                data_basename=X_test_basename,
                encoder_config='-'.join(encoder_config1)))
        Y_test_feat_path = os.path.join(
            args.out_dir, 'feat',
            '{data_basename}.{encoder_config}.vec'.format(
                data_basename=Y_test_basename,
                encoder_config='-'.join(encoder_config2)))

    # encoders
    # Each encoder is wrapped in a one-element list: if encoder1 were plain
    # None and prepare_feature_vecs() returned a new encoder, the identity
    # `encoder1 is encoder2` could not be preserved.
    encoder1 = [None]
    if encoder_config1 == encoder_config2 and args.limit_words1 == args.limit_words2:
        encoder2 = encoder1
    else:
        encoder2 = [None]

    # PREPROCESSING: load feature vectors or encode raw texts into feature vectors
    X = prepare_feature_vecs(args.file_X, X_train_feat_path, encoder1,
                             encoder_config1, args.limit_words1, args.emb1)
    Y = prepare_feature_vecs(args.file_Y, Y_train_feat_path, encoder2,
                             encoder_config2, args.limit_words2, args.emb2)

    # kernel function: (repr, repr) -> float
    kernel1, kernel_batch1 = phsic.kernel.positive_definite_kernel(
        kernel_config1, X)
    kernel2, kernel_batch2 = phsic.kernel.positive_definite_kernel(
        kernel_config2, Y)

    # fit
    kernel_type1 = kernel_config1[0]
    kernel_type2 = kernel_config2[0]
    if kernel_type1 == 'Linear' and kernel_type2 == 'Linear':
        model = phsic.phsic.PHSIC()
        model.fit_XY(X, Y)
    else:
        model = phsic.phsic.PHSIC_ICD(args.no_centering)
        model.fit_XY(X, Y, kernel1, kernel2, args.dim1, args.dim2,
                     k_batch=kernel_batch1, l_batch=kernel_batch2)

    params_text = '{kernel1}-{encoder1}.{kernel2}-{encoder2}{dim1}{dim2}'.format(
        kernel1='-'.join(args.kernel1),
        encoder1='-'.join(args.encoder1),
        kernel2='-'.join(args.kernel2),
        encoder2='-'.join(args.encoder2),
        dim1='' if kernel_type1 == 'Linear' and kernel_type2 == 'Linear'
        else '.{}'.format(args.dim1),
        dim2='' if kernel_type1 == 'Linear' and kernel_type2 == 'Linear'
        else '.{}'.format(args.dim2),
    )

    # predict
    outpath = '{prefix}.{params}.phsic'.format(prefix=args.out_prefix,
                                               params=params_text)
    if args.X_test and args.Y_test:
        X_test = prepare_feature_vecs(args.X_test, X_test_feat_path, encoder1,
                                      encoder_config1, args.limit_words1,
                                      args.emb1)
        Y_test = prepare_feature_vecs(args.Y_test, Y_test_feat_path, encoder2,
                                      encoder_config2, args.limit_words2,
                                      args.emb2)
        with open(outpath, 'w') as fo:
            phsics = model.predict_batch_XY(X_test, Y_test)
            print('writing output file...', file=sys.stderr)
            np.savetxt(fo, phsics)
    else:
        with open(outpath, 'w') as fo:
            phsics = model.predict_batch_training_data()
            print('writing output file...', file=sys.stderr)
            np.savetxt(fo, phsics)
    print('written: {}'.format(outpath), file=sys.stderr)
    print('Done!', file=sys.stderr)

def main():
    parser = argparse.ArgumentParser(
        prog='PHSIC',
        description='Give PHSIC scores to paired data.')
    parser.add_argument('file_X')
    parser.add_argument('file_Y')
    # kernel 1
    parser.add_argument('--kernel1', nargs='+', help='e.g., Gaussian 1.0',
                        required=True)
    parser.add_argument('--encoder1', nargs='+',
                        help='e.g., SumBov FasttextEn', required=True)
    parser.add_argument('--emb1', type=str, required=True)
    # kernel 2
    # required as well: run() uses kernel2/encoder2 unconditionally
    parser.add_argument('--kernel2', nargs='+', help='e.g., Gaussian 1.0',
                        required=True)
    parser.add_argument('--encoder2', nargs='+',
                        help='e.g., SumBov FasttextEn', required=True)
    parser.add_argument('--emb2', type=str, required=True)
    parser.add_argument('--dim1', type=int, required=True)
    parser.add_argument('--dim2', type=int, required=True)
    parser.add_argument('--limit_words1', type=int)
    parser.add_argument('--limit_words2', type=int)
    parser.add_argument('--no_centering', action='store_true')  # default: False
    parser.add_argument('--out_dir', required=True)
    parser.add_argument('--out_prefix', required=True)
    parser.add_argument('--X_test', '--test_file_X')
    parser.add_argument('--Y_test', '--test_file_Y')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(phsic.__version__))
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2), file=sys.stderr)

    run(args)


if __name__ == '__main__':
    main()
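# Hedged invocation sketch (hypothetical corpus and embedding paths; the Cos
# kernel over SumBov below is rewritten to Linear over NormalizedSumBov by
# phsic.config.convert_config_for_phsic, so the closed-form PHSIC is used):
#
#   python -m phsic.app data/nmt/train.en data/nmt/train.ja \
#       --kernel1 Cos --encoder1 SumBov FasttextEn --emb1 /path/to/crawl-300d-2M.vec \
#       --kernel2 Cos --encoder2 SumBov FasttextJa --emb2 /path/to/cc.ja.300.vec \
#       --dim1 100 --dim2 100 \
#       --out_dir work/nmt --out_prefix work/nmt/score/train \
#       --X_test data/nmt/test.en --Y_test data/nmt/test.ja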

# ===== phsic/config.py =====
def convert_config_for_phsic(kernel_config, encoder_config):
    """Rewrite a (kernel, encoder) pair into an equivalent form that PHSIC
    can exploit: a cosine kernel over an (unnormalized) encoder is replaced
    by a linear kernel over the L2-normalized encoder."""
    kernel_config_new = kernel_config[:]
    encoder_config_new = encoder_config[:]
    kernel_type = kernel_config[0]
    encoder_type = encoder_config[0]
    if kernel_type == 'Cos' and encoder_type in {'SumBov', 'AveBov',
                                                 'NormalizedSumBov'}:
        kernel_config_new[0] = 'Linear'
        encoder_config_new[0] = 'NormalizedSumBov'
    elif kernel_type == 'Cos' and encoder_type in {'USE', 'NormalizedUSE'}:
        kernel_config_new[0] = 'Linear'
        encoder_config_new[0] = 'NormalizedUSE'
    return kernel_config_new, encoder_config_new
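# Example: a cosine kernel over summed bag-of-vectors equals a linear kernel
# over the normalized sum, so the config is rewritten accordingly:
#
#   >>> convert_config_for_phsic(['Cos'], ['SumBov', 'FasttextEn'])
#   (['Linear'], ['NormalizedSumBov', 'FasttextEn'])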

# ===== phsic/encoder.py =====
import sys

import gensim
import numpy as np
import sklearn.preprocessing
import tensorflow as tf
import tensorflow_hub as hub

import phsic.utils

# --------------------------------
# ENCODER_CONFIG
# --------------------------------
"""
- `ENCODER_CONFIG`
    - `[ENCODER_TYPE]`                # sentence encoder
    - `[ENCODER_TYPE, WORDVEC_KEY]`   # w/ word vector
- `ENCODER_TYPE`
    - w/ word vector
        - `'SumBov'`: sum of bag-of-vectors in given sentence
        - `'AveBov'`: average of bag-of-vectors in given sentence
        - `'NormalizedSumBov'`: normalized `'SumBov'`
    - sentence encoder
        - `'USE'`: Universal Sentence Encoder w/ DAN
        - `'NormalizedUSE'`: normalized `'USE'`
- `WORDVEC_KEY`
    - `'FasttextCs'`
    - `'FasttextDe'`
    - `'FasttextJa'`
    - `'FasttextEn'`
    - `'FasttextVi'`
    - `'FasttextCsBin'`
    - `'FasttextDeBin'`
    - `'FasttextJaBin'`
    - `'FasttextViBin'`
"""


def encoder(*args, **kwargs):
    return Encoder(*args, **kwargs)


class Encoder:
    def __init__(self, encoder_config, limit_words=None, eps=1e-8,
                 emb_file=None):
        self.eps = eps
        self.encoder_type = encoder_config[0]
        # load word vector file
        if self.encoder_type in {'SumBov', 'AveBov', 'NormalizedSumBov'}:
            wordvec_key = encoder_config[1]
            self.wv_model = load_wv_from_wordvec_key(
                wordvec_key, limit_words=limit_words, emb_file=emb_file)
        # load sentence encoder
        elif self.encoder_type in {'USE', 'NormalizedUSE'}:
            tf_module_url = 'https://tfhub.dev/google/universal-sentence-encoder/1'
            print('Loading...: {}'.format(tf_module_url), file=sys.stderr)
            self.sentence_encoder = hub.Module(tf_module_url)

    def encode(self, sentence):
        if isinstance(sentence, (list, tuple)):
            sentence = ' '.join(sentence)
        if self.encoder_type == 'SumBov':
            return self.sum_of_wordvecs(self.wv_model)(sentence)
        elif self.encoder_type == 'AveBov':
            return self.ave_of_wordvecs(self.wv_model)(sentence)
        elif self.encoder_type == 'NormalizedSumBov':
            return self.normalized_sum_of_wordvecs(self.wv_model)(sentence)

    def __call__(self, sentence):
        return self.encode(sentence)

    def encode_batch_from_file(self, path):
        print('Encoding sentences in {}'.format(path), file=sys.stderr)
        return self.encode_batch(open(path))

    def encode_batch(self, sentences):
        """
        sentences: iterable of sentences

        Returns
        -------
        iterable of sentence embeddings
        """
        if self.encoder_type in {'SumBov', 'AveBov', 'NormalizedSumBov'}:
            return [self.encode(s) for s in sentences]
        elif self.encoder_type == 'USE':
            with tf.Session() as session:
                session.run([tf.global_variables_initializer(),
                             tf.tables_initializer()])
                return session.run(self.sentence_encoder(list(sentences)))
        elif self.encoder_type == 'NormalizedUSE':
            with tf.Session() as session:
                session.run([tf.global_variables_initializer(),
                             tf.tables_initializer()])
                return sklearn.preprocessing.normalize(
                    session.run(self.sentence_encoder(list(sentences))),
                    norm='l2')

    def sum_of_wordvecs(self, wv_model):
        """
        wv_model -> sentence -> vec
        """
        wv_type = type(wv_model)

        def _sum_of_wordvecs(sentence):
            vecs = [_to_vec(word.rstrip(',.'), wv_model, wv_type)
                    for word in sentence.split()
                    if _in_vocab(word.rstrip(',.'), wv_model, wv_type)]
            if vecs:
                return sum(vecs)
            else:
                # no in-vocabulary word: fall back to a random unit vector
                return phsic.utils.sample_from_unit_hypersphere(
                    wv_model.vector_size)

        return _sum_of_wordvecs

    def ave_of_wordvecs(self, wv_model):
        """
        wv_model -> sentence -> vec
        """
        wv_type = type(wv_model)

        def _ave_of_wordvecs(sentence):
            vecs = [_to_vec(word.rstrip(',.、。'), wv_model, wv_type)
                    for word in sentence.split()
                    if _in_vocab(word.rstrip(',.、。'), wv_model, wv_type)]
            if vecs:
                return np.average(vecs, axis=0)
            else:
                return phsic.utils.sample_from_unit_hypersphere(
                    wv_model.vector_size)

        return _ave_of_wordvecs

    def normalized_sum_of_wordvecs(self, wv_model):
        """
        wv_model -> sentence -> vec
        """
        wv_type = type(wv_model)

        def _normalized_sum_of_wordvecs(sentence):
            vecs = [_to_vec(word.rstrip(',.'), wv_model, wv_type)
                    for word in sentence.split()
                    if _in_vocab(word.rstrip(',.'), wv_model, wv_type)]
            if vecs:
                vec = sum(vecs)
            else:
                vec = np.zeros((wv_model.vector_size,)) + self.eps
            norm = np.linalg.norm(vec) + self.eps
            return vec / norm

        return _normalized_sum_of_wordvecs
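# Hedged usage sketch (hypothetical embedding path): build a bag-of-vectors
# encoder and embed one sentence. The 'FasttextEn' key only selects the file
# format via WORDVEC_DICT below; the vectors themselves come from `emb_file`.
#
#   enc = Encoder(['SumBov', 'FasttextEn'], limit_words=100000,
#                 emb_file='/path/to/crawl-300d-2M.vec')
#   vec = enc('this is a sentence .')   # 300-dim numpy array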

# --------------------------------
# word vector
# --------------------------------
"""
- `WORDVEC_FORMAT`
    - `'word2vec.txt'`
    - `'word2vec.bin'`
    - `'fasttext.vec'`
    - `'fasttext.bin'`
- `WV_MODEL`
    - gensim word2vec/fasttext model
- `WV_MODEL_TYPE`
    - `TYPE_WORD2VEC`
    - `TYPE_FASTTEXT`
"""

TYPE_WORD2VEC = gensim.models.keyedvectors.Word2VecKeyedVectors
TYPE_FASTTEXT = gensim.models.fasttext.FastText

# WORDVEC_KEY: dict(relpath, WORDVEC_FORMAT)
WORDVEC_DICT = {
    # word2vec.bin
    'Word2vecEnPrivate': dict(
        relpath='../data/wordvecs/word2vec/private.en.bin',
        wordvec_format='word2vec.bin',
    ),
    'Word2vecJaPrivate': dict(
        relpath='../data/wordvecs/word2vec/private.ja.bin',
        wordvec_format='word2vec.bin',
    ),
    # fasttext.vec
    'FasttextCs': dict(
        relpath='../data/wordvecs/fasttext/cc.cs.300.vec',
        wordvec_format='fasttext.vec',
    ),
    'FasttextDe': dict(
        relpath='../data/wordvecs/fasttext/cc.de.300.vec',
        wordvec_format='fasttext.vec',
    ),
    'FasttextJa': dict(
        relpath='../data/wordvecs/fasttext/cc.ja.300.vec',
        wordvec_format='fasttext.vec',
    ),
    'FasttextEn': dict(
        relpath='../data/wordvecs/fasttext/crawl-300d-2M.vec',
        wordvec_format='fasttext.vec',
    ),
    'FasttextEnLight': dict(
        relpath='../data/wordvecs/fasttext/crawl-300d-2M.vec.light',
        wordvec_format='fasttext.vec',
    ),
    'FasttextVi': dict(
        relpath='../data/wordvecs/fasttext/cc.vi.300.vec',
        wordvec_format='fasttext.vec',
    ),
    # fasttext.bin
    'FasttextCsBin': dict(
        relpath='../data/wordvecs/fasttext/cc.cs.300.bin',
        wordvec_format='fasttext.bin',
    ),
    'FasttextDeBin': dict(
        relpath='../data/wordvecs/fasttext/cc.de.300.bin',
        wordvec_format='fasttext.bin',
    ),
    'FasttextJaBin': dict(
        relpath='../data/wordvecs/fasttext/cc.ja.300.bin',
        wordvec_format='fasttext.bin',
    ),
    'FasttextViBin': dict(
        relpath='../data/wordvecs/fasttext/cc.vi.300.bin',
        wordvec_format='fasttext.bin',
    ),
}


def load_wv_from_wordvec_key(wordvec_key, limit_words=None, emb_file=None):
    # limit_words only takes effect when wv_model_type is TYPE_WORD2VEC,
    # i.e., when wordvec_format is 'fasttext.vec'
    wordvec_format = WORDVEC_DICT[wordvec_key]['wordvec_format']
    return load_wordvec(emb_file, limit_words=limit_words,
                        wordvec_format=wordvec_format)


def load_wordvec(abspath_wordvec, limit_words=None,
                 wordvec_format='word2vec'):
    # ref. https://github.com/RaRe-Technologies/gensim/issues/814
    print('Loading...: {} (limit words: {})'.format(
        abspath_wordvec, limit_words), file=sys.stderr)
    if wordvec_format == 'fasttext.vec':
        return gensim.models.KeyedVectors.load_word2vec_format(
            abspath_wordvec, binary=False, limit=limit_words)
    elif wordvec_format == 'fasttext.bin':
        return gensim.models.fasttext.FastText.load_fasttext_format(
            abspath_wordvec)
    elif wordvec_format == 'word2vec.bin':
        return gensim.models.KeyedVectors.load_word2vec_format(
            abspath_wordvec, binary=True)


# utils
def _to_vec(word, wv_model, wv_model_type=TYPE_WORD2VEC):
    if wv_model_type == TYPE_WORD2VEC:
        return wv_model[word]
    elif wv_model_type == TYPE_FASTTEXT:
        return wv_model.wv[word]


def _in_vocab(word, wv_model, wv_model_type=TYPE_WORD2VEC):
    if wv_model_type == TYPE_WORD2VEC:
        return word in wv_model.vocab  # tests if word is present in vocab
    elif wv_model_type == TYPE_FASTTEXT:
        return word in wv_model  # tests if a vector exists for word
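# Hedged loading sketch (hypothetical path): '.vec' files go through gensim's
# word2vec text loader, which supports a vocabulary cap via `limit`.
#
#   wv = load_wordvec('/path/to/cc.en.300.vec', limit_words=50000,
#                     wordvec_format='fasttext.vec')
#   _in_vocab('hello', wv)       # True if 'hello' survived the cap
#   _to_vec('hello', wv).shape   # (300,)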

# ===== phsic/kernel.py =====
import functools
import sys

import numpy as np
import scipy.spatial
import sklearn.metrics
from statsmodels.nonparametric import bandwidths

"""
- KERNEL_CONFIG:
    - ['Linear']
    - ['Cos']
    - ['ReluCos']
    - ['Gaussian', sigma]
    - ['Laplacian', gamma]
"""


# -----------------------------------------
# positive definite kernels (on R^n)
# -----------------------------------------
def positive_definite_kernel(kernel_config, data=None):
    """
    Returns (kernel, kernel_batch).

    kernel_batch:
        cf. sklearn.metrics - Pairwise metrics, Affinities and Kernels
        cf. scipy.spatial.distance
    """
    kernel_type = kernel_config[0]
    if kernel_type == 'Linear':
        return inner_product, None
    elif kernel_type == 'Cos':
        return cosine, None
    elif kernel_type == 'ReluCos':
        return relu_cosine, None
    elif kernel_type == 'Gaussian':
        bw_param = kernel_config[1]
        if bw_param == 'scott':
            if len(kernel_config) > 2:
                scale = float(kernel_config[2])
                sigma_vec = bandwidths.bw_scott(data) * scale
            else:
                sigma_vec = bandwidths.bw_scott(data)
            print('bandwidth: {}'.format(np.average(sigma_vec)))
            return gaussian(sigma_vec), gaussian_pairwise(sigma_vec)
        elif bw_param == 'silverman':
            sigma_vec = bandwidths.bw_silverman(data)
            print('bandwidth: {}'.format(np.average(sigma_vec)))
            return gaussian(sigma_vec), gaussian_pairwise(sigma_vec)
        else:
            sigma = float(kernel_config[1])
            gamma = 1.0 / (2.0 * sigma ** 2)
            return gaussian(sigma), functools.partial(
                sklearn.metrics.pairwise.rbf_kernel, gamma=gamma)
    elif kernel_type == 'Laplacian':
        gamma = float(kernel_config[1])
        return laplacian(gamma), functools.partial(
            sklearn.metrics.pairwise.laplacian_kernel, gamma=gamma)


inner_product = lambda v1, v2: np.dot(v1, v2)

cosine = lambda v1, v2: 1.0 - scipy.spatial.distance.cosine(v1, v2)
assert abs(cosine(np.array([1, 1]), np.array([1, 1])) - 1.0) < 0.0001
assert abs(cosine(np.array([1, 1]), np.array([1, -1])) - 0.0) < 0.0001

cosine_plus_one = lambda v1, v2: cosine(v1, v2) + 1
relu_cosine = lambda v1, v2: max(cosine(v1, v2), 0)


def gaussian(sigma):
    def _gaussian(v1, v2):
        if np.array_equal(v1, v2):
            return 1.0
        else:
            return np.exp(
                -(scipy.spatial.distance.sqeuclidean(v1 / sigma, v2 / sigma) / 2))

    return _gaussian


def gaussian_pairwise(sigma_vec):
    def _gaussian_pairwise(X, Y):
        K = sklearn.metrics.pairwise.euclidean_distances(
            X / sigma_vec, Y / sigma_vec, squared=True)
        K *= -0.5
        np.exp(K, K)  # exponentiate K in-place
        return K

    return _gaussian_pairwise


def laplacian(gamma):
    def _laplacian(v1, v2):
        if np.array_equal(v1, v2):
            return 1.0
        else:
            return np.exp(-gamma * np.linalg.norm(v1 - v2, ord=1))

    return _laplacian


# -----------------------------------------
# handle Gram matrices
# -----------------------------------------
def Hprod(A):
    # A: n x k numpy array
    # H = I - (1/n) 1 1^T: n x n centering matrix
    # return H @ A (n x k numpy array) without materializing H;
    # the subtraction broadcasts over rows: A - (1/n) 1 1^T A
    n = A.shape[0]
    OneTA = np.sum(A, axis=0)  # $1^T A$
    return A - ((1 / n) * OneTA)
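# Quick self-check sketch, mirroring the module-level cosine asserts above:
# the pointwise Gaussian closure must agree with its batched rbf_kernel
# counterpart, and Hprod(A) must equal the explicit product H @ A.
_V = np.random.randn(6, 3)
_k, _k_batch = positive_definite_kernel(['Gaussian', '1.0'])
assert abs(_k(_V[0], _V[1]) - _k_batch(_V[:1], _V[1:2])[0, 0]) < 1e-8
_H = np.eye(6) - np.ones((6, 6)) / 6
assert np.allclose(Hprod(_V), _H @ _V)
del _V, _k, _k_batch, _H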

# -----------------------------------------
# incomplete Cholesky decomposition with kernels
# -----------------------------------------
def icd_kernel(X, k, d, k_batch=None, verbose=False):
    """
    input
        - X: sequence of length n
        - k: positive definite kernel
        - d: dim of output data (<= n)
    output
        - A: n x d matrix
            s.t. A @ A.T ~ K (Gram matrix),
                 A[i] @ A[j] ~ k(X[i], X[j])
        - pivot_ids: d-length list, a subset of range(n),
          corresponding to the columns of A
        - pivot_xs: d-length list, a subset of X,
          corresponding to the columns of A
    """
    n = len(X)
    assert d <= n
    A = np.zeros(shape=(n, d))
    # todo: for some kernels the diagonal is constant and could be
    # initialized directly, e.g., cosine, Gaussian
    diag = np.array([k(x, x) for x in X])
    assert np.all(diag >= 0.)
    # todo: with a list (rather than a set), remove() is O(n)
    remain_ids = list(range(n))
    pivot_ids = list()  # used ids
    print('  iter: {}, approx error: {}'.format(0, sum(diag)),
          file=sys.stderr)
    for i in range(d):
        # select pivot (index)
        p = np.argmax(diag)
        pivot_ids.append(p)  # p == pivot_ids[i]
        remain_ids.remove(p)

        A[p, i] = np.sqrt(diag[p])
        if k_batch:
            A[remain_ids, i] = np.array(
                (k_batch(X, X[p].reshape(1, -1)).reshape((n,))
                 - A[:, 0:i] @ A[p, 0:i]) / A[p, i])[remain_ids]
        else:
            A[remain_ids, i] = np.array(
                ([k(X[j], X[p]) for j in range(n)]
                 - A[:, 0:i] @ A[p, 0:i]) / A[p, i])[remain_ids]
        diag[remain_ids] -= A[remain_ids, i] ** 2
        diag[p] = 0.0  # eliminate selected pivot

        if i + 1 <= 5 or (i + 1) % 10 == 0:
            print('  iter: {}, approx error: {}'.format(i + 1, sum(diag)),
                  file=sys.stderr)

    pivot_xs = [X[i] for i in pivot_ids]
    print('  pivot ids selected in iteration: {}'.format(pivot_ids),
          file=sys.stderr)
    return A, pivot_ids, pivot_xs


def icd_for_new_data(A, pivot_ids, pivot_xs, k, x_new):
    d = A.shape[1]
    assert len(pivot_ids) == d
    assert len(pivot_xs) == d
    a = list()  # todo: -> listcomp
    for i in range(d):
        a.append((k(x_new, pivot_xs[i]) - a[0:i] @ A[pivot_ids[i], 0:i])
                 / A[pivot_ids[i], i])
    return np.array(a)
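# Self-check sketch: with a linear kernel on points in R^3 the Gram matrix has
# rank <= 3, so ICD with d=3 recovers it (nearly) exactly, and re-encoding a
# training point with icd_for_new_data() reproduces its row of A.
_X = np.random.randn(20, 3)
_k, _ = positive_definite_kernel(['Linear'])
_A, _pids, _pxs = icd_kernel(_X, _k, 3)
assert np.allclose(_A @ _A.T, _X @ _X.T, atol=1e-6)
assert np.allclose(icd_for_new_data(_A, _pids, _pxs, _k, _X[0]), _A[0],
                   atol=1e-6)
del _X, _k, _A, _pids, _pxs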

# ===== phsic/phsic.py =====
import sys

import numpy as np

import phsic.kernel


def _inner1d(a, b):
    # row-wise inner products, i.e., (a * b).sum(axis=1); stands in for
    # numpy.core.umath_tests.inner1d, which newer numpy versions removed
    return np.einsum('ij,ij->i', a, b)


class PHSIC():
    def __init__(self):
        self.xp = np

    def fit_XY(self, X, Y):
        """
        params
            X: sequence of length n consisting of xs; [x_i]
               ... observed data on X (d1-dim feature vectors)
            Y: sequence of length n consisting of ys; [y_i]
               ... observed data on Y (d2-dim feature vectors)
        saves
            X     : n x d1 matrix
            Y     : n x d2 matrix
            x_mean: d1-dim array
            y_mean: d2-dim array
            CXY   : d1 x d2 cross-covariance matrix between X and Y
        """
        print('phsic.fit_XY()...', file=sys.stderr)
        n = len(X)
        m = len(Y)
        assert n == m
        self.X = np.array(X)
        self.Y = np.array(Y)
        XTY = np.dot(self.X.T, self.Y)
        self.x_mean = np.sum(self.X, axis=0) / n  # \bar{x}; shape (d1,)
        self.y_mean = np.sum(self.Y, axis=0) / n  # \bar{y}; shape (d2,)
        self.CXY = XTY / n - np.outer(self.x_mean, self.y_mean)

    def predict(self, x, y):
        """
        returns
            float ... PHSIC(x, y; X, Y)
        """
        return np.dot(x - self.x_mean, self.CXY @ (y - self.y_mean))

    def predict_batch_XY(self, X, Y):
        """
        params
            X: sequence of length n consisting of xs; [x_i] ... observed data on X
            Y: sequence of length n consisting of ys; [y_i] ... observed data on Y
        returns
            n-dim array of PHSIC scores
        """
        print('phsic.predict_batch_XY()...', file=sys.stderr)
        return np.array([self.predict(x, y) for x, y in zip(X, Y)])

    def predict_batch_training_data(self):
        print('phsic.predict_batch_training_data()...', file=sys.stderr)
        X = self.X
        Y = self.Y
        return np.array([self.predict(x, y) for x, y in zip(X, Y)])


class PHSIC_ICD():
    def __init__(self, no_centering=False):
        self.xp = np
        self.no_centering = no_centering

    def fit(self, Z, k, l, d1, d2):
        """
        params
            Z: sequence of length n consisting of (x, y) pairs; [(x_i, y_i)]
               ... observed data on X times Y
            k: function k(x_i, x_j) that returns a real number
               ... positive definite kernel on X
            l: function l(y_i, y_j) that returns a real number
               ... positive definite kernel on Y
            d1, d2: dimensions of ICD on the X and Y sides
        saves
            A     : n x d1 matrix
            B     : n x d2 matrix
            a_mean: d1-dim array
            b_mean: d2-dim array
            C_ICD : d1 x d2 matrix
        """
        print('phsic.fit()...', file=sys.stderr)
        n = len(Z)
        X, Y = tuple(zip(*Z))
        self.fit_XY(X, Y, k, l, d1, d2)

    def fit_XY(self, X, Y, k, l, d1, d2, k_batch=None, l_batch=None):
        """
        params
            X: sequence of length n consisting of xs; [x_i] ... observed data on X
            Y: sequence of length n consisting of ys; [y_i] ... observed data on Y
            k: function k(x_i, x_j) that returns a real number
               ... positive definite kernel on X
            l: function l(y_i, y_j) that returns a real number
               ... positive definite kernel on Y
            d1: dimension of ICD on X side
            d2: dimension of ICD on Y side
        saves
            A     : n x d1 matrix
            B     : n x d2 matrix
            a_mean: d1-dim array
            b_mean: d2-dim array
            C_ICD : d1 x d2 matrix
        """
        print('phsic.fit_XY()...', file=sys.stderr)
        n = len(X)
        m = len(Y)
        assert n == m

        # learning
        print('  incomplete Cholesky decomposition on X side...',
              file=sys.stderr)
        self.A, self.pivot_xids, self.pivot_xs = phsic.kernel.icd_kernel(
            X, k, d1, k_batch=k_batch)
        print('  incomplete Cholesky decomposition on Y side...',
              file=sys.stderr)
        self.B, self.pivot_yids, self.pivot_ys = phsic.kernel.icd_kernel(
            Y, l, d2, k_batch=l_batch)
        self.a_mean = np.array(self.A.T @ np.ones(shape=(n))) / n
        self.b_mean = np.array(self.B.T @ np.ones(shape=(n))) / n
        # todo: avoid building np.ones twice; is the np.array() wrapper needed?
        self.C_ICD = np.array(phsic.kernel.Hprod(self.A).T @ self.B) / n

        # no_centering
        if self.no_centering:
            self.C_ICD_NC = (self.A.T @ self.B) / n

        self.k = k
        self.l = l

    def predict(self, x, y):
        """
        returns
            float ... PHSIC(x, y; Z, k, l)
        """
        a = phsic.kernel.icd_for_new_data(
            self.A, self.pivot_xids, self.pivot_xs, self.k, x)
        b = phsic.kernel.icd_for_new_data(
            self.B, self.pivot_yids, self.pivot_ys, self.l, y)
        if self.no_centering:
            return a @ (self.C_ICD_NC @ b)
        else:
            return (a - self.a_mean) @ (self.C_ICD @ (b - self.b_mean))

    def predict_batch_XY(self, X, Y):
        """
        params
            X: sequence of length n consisting of xs; [x_i] ... observed data on X
            Y: sequence of length n consisting of ys; [y_i] ... observed data on Y
        returns
            n-dim array of PHSIC scores
        """
        print('phsic.predict_batch_XY()...', file=sys.stderr)
        A_new = np.array([phsic.kernel.icd_for_new_data(
            self.A, self.pivot_xids, self.pivot_xs, self.k, x) for x in X])
        B_new = np.array([phsic.kernel.icd_for_new_data(
            self.B, self.pivot_yids, self.pivot_ys, self.l, y) for y in Y])
        if self.no_centering:
            return _inner1d(A_new, (self.C_ICD_NC @ B_new.T).T)
        else:
            return _inner1d(A_new - self.a_mean,
                            (self.C_ICD @ (B_new - self.b_mean).T).T)

    def predict_batch_training_data(self):
        print('phsic.predict_batch_training_data()...', file=sys.stderr)
        A_new = self.A
        B_new = self.B
        if self.no_centering:
            return _inner1d(A_new, (self.C_ICD_NC @ B_new.T).T)
        else:
            return _inner1d(A_new - self.a_mean,
                            (self.C_ICD @ (B_new - self.b_mean).T).T)
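# End-to-end sketch on synthetic data (hedged; random inputs, no files): the
# closed-form linear PHSIC scores each pair against the cross-covariance
# structure of the sample, and the ICD variant does the same with Gaussian
# kernels through low-rank Gram factors.
#
#   import numpy as np
#   import phsic.kernel
#   from phsic.phsic import PHSIC, PHSIC_ICD
#
#   rng = np.random.RandomState(0)
#   X = rng.randn(50, 4)
#   Y = X @ rng.randn(4, 3) + 0.1 * rng.randn(50, 3)  # correlated pairs
#
#   model = PHSIC()
#   model.fit_XY(X, Y)
#   scores = model.predict_batch_training_data()      # shape (50,)
#
#   k, k_batch = phsic.kernel.positive_definite_kernel(['Gaussian', '1.0'])
#   l, l_batch = phsic.kernel.positive_definite_kernel(['Gaussian', '1.0'])
#   icd = PHSIC_ICD()
#   icd.fit_XY(X, Y, k, l, 10, 10, k_batch=k_batch, l_batch=l_batch)
#   icd_scores = icd.predict_batch_XY(X, Y)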

# ===== phsic/utils.py =====
import numpy as np


def sample_from_unit_hypersphere(dim):
    # sample uniformly at random from the unit hypersphere in `dim` dimensions
    # via "球面上に一様分布する乱数の生成" (generating uniformly distributed
    # random points on a sphere), Qiita:
    # https://qiita.com/mh-northlander/items/a2e643cf62317f129541
    x = np.random.standard_normal(dim)
    r = np.linalg.norm(x)
    return x / r


# ===== phsic_cli-0.1.0.dist-info/LICENSE =====
BSD 3-Clause License

Copyright (c) 2018, Sho Yokoi
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its contributors
  may be used to endorse or promote products derived from this software
  without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.