# File: gel/__init__.py

import gel.gelcd
import gel.gelfista
import gel.gelpaths
import gel.ridgepaths

__version__ = "1.0.0"
__all__ = ["gelcd", "gelfista", "gelpaths", "ridgepaths"]


# File: gel/gelcd.py

"""gelcd.py: coordinate descent for group elastic net.

This implementation uses block-wise coordinate descent to solve the
optimization problem

    min_b (1/2m)||y - b_0 - sum_j A_j@b_j||^2
          + sum_j sqrt{n_j}*(l_1*||b_j|| + l_2*||b_j||^2).

The algorithm repeatedly minimizes with respect to b_0 and the b_js. For b_0,
the minimization has a closed form solution. For each b_j, the minimization
objective is convex and twice differentiable, and can be solved with gradient
descent, Newton's method, etc. This implementation allows the internal
minimizer to be chosen; these are the block_solve_* functions. Each solves the
above optimization problem with respect to one of the b_js, while keeping the
others fixed. The gradient with respect to b_j is

    (-1/m)*A_j'@(y - b_0 - sum_k A_k@b_k)
        + sqrt{n_j}*(l_1*b_j/||b_j|| + 2*l_2*b_j)

and the Hessian is

    (1/m)*A_j'@A_j
        + sqrt{n_j}*(l_1*(I/||b_j|| - b_j@b_j'/||b_j||^3) + 2*l_2*I).

The coefficients are represented by a scalar bias b_0 and a matrix B where
each row corresponds to a single group (with appropriate 0 padding). The
square roots of the group sizes are stored in a vector sns. The features are
stored in a list of tensors, each of size m x n_j, where m is the number of
samples and n_j is the number of features in group j. For any j,

    r_j = y - b_0 - sum_{k != j} A_k@b_k,
    q_j = r_j - A_j@b_j,
    C_j = A_j'@A_j / m,

and I_j is an identity matrix of size n_j. Finally, a_1 = l_1*sns and
a_2 = 2*l_2*sns.
"""

import torch
import tqdm


def _f_j(q_j, b_j_norm, a_1_j, a_2_j, m):
    """Compute the objective with respect to one of the coefficients, i.e.

        (1/2m)||q_j||^2 + a_1_j*||b_j|| + (a_2_j/2)*||b_j||^2.
    """
    return (
        ((q_j @ q_j) / (2.0 * m))
        + (a_1_j * b_j_norm)
        + ((a_2_j / 2.0) * (b_j_norm ** 2))
    )


def _grad_j(q_j, A_j, b_j, b_j_norm, a_1_j, a_2_j, m):
    """Compute the gradient with respect to one of the coefficients."""
    return (A_j.t() @ q_j / (-m)) + (b_j * (a_1_j / b_j_norm + a_2_j))


def _hess_j(C_j, I_j, b_j, b_j_norm, a_1_j, a_2_j):
    """Compute the Hessian with respect to one of the coefficients."""
    D_j = torch.ger(b_j, b_j)
    return (
        C_j + (a_1_j / b_j_norm) * (I_j - D_j / (b_j_norm ** 2)) + a_2_j * I_j
    )


def block_solve_agd(
    r_j,
    A_j,
    a_1_j,
    a_2_j,
    m,
    b_j_init,
    t_init=None,
    ls_beta=None,
    max_iters=None,
    rel_tol=1e-6,
    verbose=False,
    zero_thresh=1e-6,
    zero_fill=1e-3,
):
    """Solve the optimization problem for a single block with accelerated
    gradient descent."""
    b_j = b_j_init
    b_j_prev = b_j
    k = 1  # iteration number
    t = 1  # initial step length (used if t_init is None)
    pbar_stats = {}  # stats for the progress bar
    pbar = tqdm.tqdm(
        desc="Solving block with AGD", disable=not verbose, leave=False
    )
    while True:
        # Compute the v terms.
        mom = (k - 2) / (k + 1.0)
        v_j = b_j + mom * (b_j - b_j_prev)
        q_v_j = r_j - A_j @ v_j
        v_j_norm = v_j.norm(p=2)
        f_v_j = _f_j(q_v_j, v_j_norm, a_1_j, a_2_j, m)
        grad_v_j = _grad_j(q_v_j, A_j, v_j, v_j_norm, a_1_j, a_2_j, m)
        b_j_prev = b_j

        # Adjust the step size with backtracking line search.
        if t_init is not None:
            t = t_init
        while True:
            b_j = v_j - t * grad_v_j  # gradient descent update
            if ls_beta is None:
                # Don't perform line search.
                break
            # Line search: exit when f_j(b_j) <= f_j(v_j) + grad_v_j'@(b_j -
            # v_j) + (1/2t)||b_j - v_j||^2.
            q_b_j = r_j - A_j @ b_j
            b_j_norm = b_j.norm(p=2)
            f_b_j = _f_j(q_b_j, b_j_norm, a_1_j, a_2_j, m)
            b_v_diff = b_j - v_j
            c2 = grad_v_j @ b_v_diff
            c3 = b_v_diff @ b_v_diff / 2.0
            if t * f_b_j <= t * (f_v_j + c2) + c3:
                break
            else:
                t *= ls_beta

        # Make b_j non-zero if it is 0.
        if len((b_j.abs() < zero_thresh).nonzero()) == len(b_j):
            b_j.fill_(zero_fill)

        b_j_norm = b_j.norm(p=2)
        b_diff_norm = (b_j - b_j_prev).norm(p=2)
        pbar_stats["t"] = "{:.2g}".format(t)
        pbar_stats["rel change"] = "{:.2g}".format(b_diff_norm / b_j_norm)
        pbar.set_postfix(pbar_stats)
        pbar.update()

        # Check max iterations exit criterion.
        if max_iters is not None and k == max_iters:
            break
        k += 1

        # Check tolerance exit criterion: exit when the relative change is
        # less than the tolerance. k > 2 ensures that at least 2 iterations
        # are completed.
        if b_diff_norm <= rel_tol * b_j_norm and k > 2:
            break
    pbar.close()
    return b_j
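
# Illustrative sketch (not part of the original package): a minimal call to
# block_solve_agd on synthetic data. The helper name and all shapes and
# values below are made up for illustration.
def _demo_block_solve_agd():  # hypothetical, not part of the API
    torch.manual_seed(0)
    m, n_j = 50, 5
    A_j = torch.randn(m, n_j)
    r_j = torch.randn(m)  # stands in for y - b_0 - sum_{k != j} A_k@b_k
    sns_j = n_j ** 0.5
    l_1 = l_2 = 0.1
    a_1_j, a_2_j = l_1 * sns_j, 2 * l_2 * sns_j
    b_j_init = torch.full((n_j,), 1e-3)  # the initial value must be non-zero
    return block_solve_agd(
        r_j, A_j, a_1_j, a_2_j, m, b_j_init,
        t_init=1.0, ls_beta=0.9, max_iters=500,
    )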

def block_solve_newton(
    r_j,
    A_j,
    a_1_j,
    a_2_j,
    m,
    b_j_init,
    C_j,
    I_j,
    ls_alpha=0.5,
    ls_beta=0.9,
    max_iters=20,
    tol=1e-8,
    verbose=False,
):
    """Solve the optimization problem for a single block with Newton's
    method."""
    b_j = b_j_init
    k = 1
    pbar_stats = {}  # stats for the progress bar
    pbar = tqdm.tqdm(
        desc="Solving block with Newton's method",
        disable=not verbose,
        leave=False,
    )
    while True:
        # First, compute the Newton step and decrement.
        q_b_j = r_j - A_j @ b_j
        b_j_norm = b_j.norm(p=2)
        grad_b_j = _grad_j(q_b_j, A_j, b_j, b_j_norm, a_1_j, a_2_j, m)
        hess_b_j = _hess_j(C_j, I_j, b_j, b_j_norm, a_1_j, a_2_j)
        hessinv_b_j = torch.inverse(hess_b_j)
        v_j = hessinv_b_j @ grad_b_j
        dec_j = grad_b_j @ (hessinv_b_j @ grad_b_j)

        # Check tolerance stopping criterion. Exit if dec_j / 2 is less than
        # the tolerance.
        if dec_j / 2 <= tol:
            break

        # Perform backtracking line search.
        t = 1
        f_b_j = _f_j(q_b_j, b_j_norm, a_1_j, a_2_j, m)
        k_j = grad_b_j @ v_j
        while True:
            # Compute the update and evaluate the function at that point.
            bp_j = b_j - t * v_j
            q_bp_j = r_j - A_j @ bp_j
            bp_j_norm = bp_j.norm(p=2)
            f_bp_j = _f_j(q_bp_j, bp_j_norm, a_1_j, a_2_j, m)
            if f_bp_j <= f_b_j - ls_alpha * t * k_j:
                b_j = bp_j
                break
            else:
                t *= ls_beta

        # Make b_j non-zero if it is 0.
        if all(b_j.abs() < tol):
            b_j.fill_(1e-3)

        pbar_stats["t"] = "{:.2g}".format(t)
        pbar_stats["1/2 newton decrement"] = "{:.2g}".format(dec_j / 2)
        pbar.set_postfix(pbar_stats)
        pbar.update()

        # Check max iterations stopping criterion.
        if max_iters is not None and k == max_iters and k > 2:
            break
        k += 1
    pbar.close()
    return b_j


def make_A(
    As, ns, device=torch.device("cpu"), dtype=torch.float32
):  # pylint: disable=unused-argument
    """Move the As to the provided device, convert them to the provided
    dtype, and return the list as A."""
    As = [A_j.to(device, dtype) for A_j in As]
    return As
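
# Illustrative sketch (not part of the original package): precomputing the
# C_j and I_j matrices that block_solve_newton expects -- as defined in the
# module docstring, C_j = A_j'@A_j / m and I_j is an identity of size n_j --
# and a single call on synthetic data. The helper name and all values are
# made up for illustration.
def _demo_block_solve_newton():  # hypothetical, not part of the API
    torch.manual_seed(0)
    m, n_j = 50, 4
    A_j = torch.randn(m, n_j)
    r_j = torch.randn(m)
    C_j = A_j.t() @ A_j / m  # as defined in the module docstring
    I_j = torch.eye(n_j)
    sns_j = n_j ** 0.5
    l_1 = l_2 = 0.1
    a_1_j, a_2_j = l_1 * sns_j, 2 * l_2 * sns_j
    b_j_init = torch.full((n_j,), 1e-3)  # the initial value must be non-zero
    return block_solve_newton(r_j, A_j, a_1_j, a_2_j, m, b_j_init, C_j, I_j)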

def gel_solve(
    A,
    y,
    l_1,
    l_2,
    ns,
    b_init=None,
    block_solve_fun=block_solve_agd,
    block_solve_kwargs=None,
    max_cd_iters=None,
    rel_tol=1e-6,
    Cs=None,
    Is=None,
    verbose=False,
):
    """Solve a group elastic net problem.

    Arguments:
        A: list of feature tensors, one per group (each of size m x n_j).
        y: tensor vector of predictions.
        l_1: the 2-norm coefficient.
        l_2: the squared 2-norm coefficient.
        ns: iterable of group sizes.
        b_init: tuple (b_0, B) to initialize b.
        block_solve_fun: the function to be used for minimizing individual
            blocks (should be one of the block_solve_* functions).
        block_solve_kwargs: dictionary of arguments to be passed to
            block_solve_fun.
        max_cd_iters: maximum number of outer CD iterations.
        rel_tol: tolerance for the exit criterion; after every outer CD
            iteration, b is compared to its previous value, and if the
            relative difference is less than this value, the loop is
            terminated.
        Cs, Is: lists of C_j and I_j matrices respectively; used only if the
            internal solver is Newton's method.
        verbose: boolean to enable/disable verbosity.
    """
    p = len(A)
    m = len(y)
    device = A[0].device
    dtype = A[0].dtype
    y = y.to(device, dtype)
    if block_solve_kwargs is None:
        block_solve_kwargs = dict()
    if not isinstance(ns, torch.Tensor):
        ns = torch.tensor(ns)
    # Create initial values if not specified.
    if b_init is None:
        b_init = (
            0.0,
            torch.zeros(p, ns.max().item(), device=device, dtype=dtype),
        )
    sns = ns.to(device, dtype).sqrt()
    a_1 = l_1 * sns
    ma_1 = m * a_1
    a_2 = 2 * l_2 * sns

    b_0, B = b_init
    # B is updated in place below, so keep a copy for the change check.
    b_0_prev, B_prev = b_0, B.clone()
    k = 1  # iteration number
    pbar_stats = {}  # stats for the outer progress bar
    pbar = tqdm.tqdm(
        desc="Solving gel with CD (l_1 {:.2g}, l_2 {:.2g})".format(l_1, l_2),
        disable=not verbose,
    )
    while True:
        # First minimize with respect to b_0. This has the closed form
        # solution b_0 = 1'@(y - sum_j A_j@b_j) / m.
        b_0 = (y - sum(A[j] @ B[j, : ns[j]] for j in range(p))).sum() / m

        # Now minimize with respect to each b_j.
        for j in tqdm.trange(
            p,
            desc="Solving individual blocks",
            disable=not verbose,
            leave=False,
        ):
            r_j = y - b_0 - sum(
                A[k] @ B[k, : ns[k]] for k in range(p) if k != j
            )
            # Check if b_j must be set to 0. The condition is
            # ||A_j'@r_j|| <= m*a_1.
            if (A[j].t() @ r_j).norm(p=2) <= ma_1[j]:
                B[j] = 0
            else:
                # Otherwise, minimize. First make sure the initial value is
                # not 0.
                if len((B[j, : ns[j]].abs() < 1e-6).nonzero()) == ns[j]:
                    B[j, : ns[j]] = 1e-3
                # Add C_j and I_j to the arguments if using Newton's method.
                if block_solve_fun is block_solve_newton:
                    block_solve_kwargs["C_j"] = Cs[j]
                    block_solve_kwargs["I_j"] = Is[j]
                B[j, : ns[j]] = block_solve_fun(
                    r_j,
                    A[j],
                    a_1[j].item(),
                    a_2[j].item(),
                    m,
                    B[j, : ns[j]],
                    verbose=verbose,
                    **block_solve_kwargs
                )

        # Compute the relative change in b.
        b_0_diff = b_0 - b_0_prev
        B_diff = B - B_prev
        delta_norm = (b_0_diff ** 2 + (B_diff ** 2).sum()).sqrt()
        b_norm = (b_0 ** 2 + (B ** 2).sum()).sqrt()
        pbar_stats["rel change"] = "{:.2g}".format(
            delta_norm.item() / b_norm.item()
        )
        pbar.set_postfix(pbar_stats)
        pbar.update()

        # Check max iterations exit criterion.
        if max_cd_iters is not None and k == max_cd_iters:
            break
        k += 1

        # Check tolerance exit criterion.
        if delta_norm.item() <= rel_tol * b_norm.item() and k > 2:
            break
        b_0_prev, B_prev = b_0, B.clone()  # again, copy the mutated B
    pbar.close()
    return b_0.item(), B
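
# --------------------------------------------------------------------------
# Illustrative sketch (not shipped in the wheel): end-to-end use of the CD
# solver on synthetic data. The file name, data, and parameter values below
# are made up for illustration.
# --------------------------------------------------------------------------
# File: examples/demo_gelcd.py (hypothetical)

import torch

import gel.gelcd as gelcd

if __name__ == "__main__":
    torch.manual_seed(0)
    m = 100
    ns = [3, 4, 2]  # three groups
    As = [torch.randn(m, n_j) for n_j in ns]
    y = torch.randn(m)
    A = gelcd.make_A(As, ns)  # for gelcd, this just moves/converts the list
    b_0, B = gelcd.gel_solve(
        A,
        y,
        l_1=0.05,
        l_2=0.05,
        ns=ns,
        block_solve_fun=gelcd.block_solve_agd,
        block_solve_kwargs={"ls_beta": 0.9, "max_iters": 500},
        max_cd_iters=100,
    )
    print("b_0:", b_0)
    print("B:", B)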

# File: gel/gelfista.py

"""gelfista.py: FISTA-like algorithm for group elastic net.

This implementation uses accelerated proximal gradient descent to solve the
optimization problem

    min_b (1/2m)||y - b_0 - sum_j A_j@b_j||^2
          + sum_j sqrt{n_j}*(l_1*||b_j|| + l_2*||b_j||^2).

Here, and everywhere else, the "@" symbol is used to denote matrix
multiplication. Denoting the first part of the objective by g, and the second
part by h, the algorithm can be written as

    v = b^{k-1} + ((k-2)/(k+1))*(b^{k-1} - b^{k-2})
    b^k = prox_{t_k}(v - t_k*grad(g(v)))

Here t_k is a step length chosen by backtracking line search, and prox is the
proximal operator of h.

The functions in this module use certain variables and conventions, defined
next. The coefficients are represented by a scalar bias b_0 and a matrix B
where each row corresponds to a single group (with appropriate 0 padding).
The square roots of the group sizes are stored in a matrix sns broadcast to
the shape of B. The features are stored in a 3D tensor A of size
#groups x max{n_j} x m. Two helper variables are used: a_1 = l_1*sns and
a_2 = 2*l_2*sns.
"""

import torch
import tqdm


def _prox(B, a_1, a_2, t):
    """Compute the prox operator of h. In this case, this reduces to simple
    group soft thresholding:

        prox_t(x)_0 = x_0
        prox_t(x)_j = (x_j/||x_j||) * ((||x_j|| - t*l_1*sqrt(n_j))
                                       / (1 + 2*t*l_2*sqrt(n_j)))
                      if ||x_j|| > t*l_1*sqrt(n_j), else 0.

    Here x_j denotes the coefficient vector for the j^th group. Arguments
    have their usual meanings. b_0 is not passed to the function since it is
    not changed.
    """
    # First compute assuming the 'if' branch of the condition.
    ta_1 = t * a_1
    norms = B.norm(p=2, dim=1, keepdim=True).expand_as(B)
    shrinkage = (norms - ta_1) / (1 + t * a_2)
    prox = B * shrinkage / norms
    # Threshold to zero where ||x_j|| is below the threshold.
    thresh_cond = norms <= ta_1
    prox[thresh_cond] = 0
    return prox


def _r(A, b_0, B, y):
    """Compute y - b_0 - sum_j A_j@b_j."""
    # bmm will return a tensor with the individual A_j@b_j products.
    AB = torch.bmm(A.permute(0, 2, 1), B.unsqueeze(2)).sum(dim=0).squeeze()
    return y - b_0 - AB


def _g(A, b_0, B, y, m):
    """Compute g(b)."""
    # m is the number of samples.
    r = _r(A, b_0, B, y)
    return r @ r / (2.0 * m)


def _grad(A, b_0, B, y, p, m):
    """Compute the gradient of g.

    The gradient for b_0 is 1'@r / -m, and the gradient for b_j is
    A_j'@r / -m.
    """
    r = _r(A, b_0, B, y)
    grad_b_0 = r.sum() / -m
    # r must be broadcast for the next operation.
    r = r.unsqueeze(0).unsqueeze(2).expand(p, m, 1)
    grad_B = torch.bmm(A, r).squeeze() / -m
    if len(grad_B.shape) < len(B.shape):
        grad_B = grad_B.unsqueeze(1)
    return grad_b_0, grad_B


def make_A(As, ns, device=torch.device("cpu"), dtype=torch.float32):
    """Create the 3D tensor A as needed by gel_solve, given a list of feature
    matrices.

    Arguments:
        As: list of feature matrices, one per group (each of size m x n_j).
        ns: LongTensor of group sizes.
        device: torch device (default cpu).
        dtype: torch dtype (default float32).
    """
    A = torch.zeros(
        len(ns), int(ns.max()), As[0].shape[0], device=device, dtype=dtype
    )
    for j, n_j in enumerate(ns):
        # Fill A[j] with A_j'.
        A[j, :n_j, :] = As[j].t()
    return A
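
# Illustrative sketch (not part of the original package): the effect of
# _prox on a toy B. The helper name and all values are made up: the first
# group has a large norm and is only shrunk, while the second falls below
# the threshold t*l_1*sqrt(n_j) and is zeroed.
def _demo_prox():  # hypothetical, not part of the API
    B = torch.tensor([[3.0, 4.0], [0.1, 0.1]])  # two groups of size 2
    sns = torch.full_like(B, 2 ** 0.5)  # sqrt(n_j), broadcast to B's shape
    l_1, l_2 = 1.0, 0.5
    a_1, a_2 = l_1 * sns, 2 * l_2 * sns
    # With t = 0.5, the threshold is 0.5*1.0*sqrt(2) ~= 0.71: group 1 (norm
    # 5) is shrunk, and group 2 (norm ~0.14) is set to 0.
    return _prox(B, a_1, a_2, 0.5)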

def gel_solve(
    A,
    y,
    l_1,
    l_2,
    ns,
    b_init=None,
    t_init=None,
    ls_beta=None,
    max_iters=None,
    rel_tol=1e-6,
    verbose=False,
):
    """Solve a group elastic net problem.

    Arguments:
        A: 3D tensor of features as described in the module docstring,
            returned by make_A.
        y: tensor vector of predictions.
        l_1: the 2-norm coefficient.
        l_2: the squared 2-norm coefficient.
        ns: iterable of group sizes.
        b_init: tuple (b_0, B) to initialize b.
        t_init: initial step size for line search; if None, it is set to the
            value from the previous iteration.
        ls_beta: shrinkage factor for line search; if None, no line search is
            performed, and t_init is used as the step size for every
            iteration.
        max_iters: maximum number of iterations; if None, there is no limit.
        rel_tol: tolerance for the exit criterion.
        verbose: boolean to enable/disable verbosity.
    """
    p = len(ns)
    m = len(y)
    device = A.device
    dtype = A.dtype
    y = y.to(device, dtype)
    if not isinstance(ns, torch.Tensor):
        ns = torch.tensor(ns)
    # Create initial values if not specified.
    if b_init is None:
        b_init = (
            0.0,
            torch.zeros(p, int(ns.max()), device=device, dtype=dtype),
        )
    b_0, B = b_init
    b_0_prev, B_prev = b_0, B
    sns = ns.to(device, dtype).sqrt().unsqueeze(1).expand_as(B)
    a_1 = l_1 * sns
    a_2 = 2 * l_2 * sns
    k = 1  # iteration number
    t = 1  # initial step length (used if t_init is None)
    pbar_stats = {}  # stats for the progress bar
    pbar = tqdm.tqdm(
        desc="Solving gel with FISTA (l_1 {:.2g}, l_2 {:.2g})".format(
            l_1, l_2
        ),
        disable=not verbose,
    )
    while True:
        # Compute the v terms.
        mom = (k - 2) / (k + 1.0)
        v_0 = b_0 + mom * (b_0 - b_0_prev)
        V = B + mom * (B - B_prev)
        g_v = _g(A, v_0, V, y, m)
        grad_v_0, grad_V = _grad(A, v_0, V, y, p, m)
        b_0_prev, B_prev = b_0, B

        # Adjust the step size with backtracking line search.
        if t_init is not None:
            t = t_init
        while True:
            # Compute the update based on the gradient, then apply prox.
            b_0 = v_0 - t * grad_v_0
            B = V - t * grad_V
            B = _prox(B, a_1, a_2, t)
            if ls_beta is None:
                # Don't perform line search.
                break
            # The line search condition is to exit when
            # g(b) <= g(v) + grad_v'@(b - v) + (1/2t)||b - v||^2.
            g_b = _g(A, b_0, B, y, m)
            b0_v0_diff = b_0 - v_0
            B_V_diff = B - V
            # grad_v'@(b - v):
            c_2 = grad_v_0 * b0_v0_diff + (grad_V * B_V_diff).sum()
            # (1/2t)||b - v||^2:
            c_3 = (b0_v0_diff ** 2 + (B_V_diff ** 2).sum()) / (2.0 * t)
            if g_b <= g_v + c_2 + c_3:
                break
            else:
                t *= ls_beta

        # Compute the relative change in b.
        b_0_diff = b_0 - b_0_prev
        B_diff = B - B_prev
        delta_norm = (b_0_diff ** 2 + (B_diff ** 2).sum()).sqrt()
        b_norm = (b_0 ** 2 + (B ** 2).sum()).sqrt()

        pbar_stats["t"] = "{:.2g}".format(t)
        pbar_stats["rel change"] = "{:.2g}".format(
            (delta_norm / b_norm).item()
        )
        pbar.set_postfix(pbar_stats)
        pbar.update()

        # Check max iterations exit criterion.
        if max_iters is not None and k == max_iters:
            break
        k += 1

        # Check tolerance exit criterion. Break if the relative change in
        # 2-norm between b and b_prev is less than rel_tol.
        # k > 2 ensures that at least 2 iterations are completed.
        if delta_norm.item() <= rel_tol * b_norm.item() and k > 2:
            break
    pbar.close()
    return b_0.item(), B
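
# --------------------------------------------------------------------------
# Illustrative sketch (not shipped in the wheel): end-to-end use of the
# FISTA solver on synthetic data. The file name, data, and parameter values
# below are made up for illustration.
# --------------------------------------------------------------------------
# File: examples/demo_gelfista.py (hypothetical)

import torch

import gel.gelfista as gelfista

if __name__ == "__main__":
    torch.manual_seed(0)
    m = 100
    ns = torch.tensor([3, 4, 2])
    As = [torch.randn(m, int(n_j)) for n_j in ns]
    y = torch.randn(m)
    A = gelfista.make_A(As, ns)  # #groups x max{n_j} x m tensor
    b_0, B = gelfista.gel_solve(
        A, y, l_1=0.05, l_2=0.05, ns=ns, ls_beta=0.9, max_iters=1000
    )
    print("b_0:", b_0)
    print("B:", B)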

# File: gel/gelpaths.py

"""gelpaths.py: solution paths for group elastic net.

This module implements a 2-stage process where group elastic net is used to
find the support, and ridge regression is performed on it.
"""

import math
import sys

import torch

from gel.ridgepaths import ridge_paths


def _find_support(B, ns, supp_thresh):
    """Find features with non-zero coefficients."""
    ns_cast = ns.unsqueeze(1).to(B.device, B.dtype)
    norms = (B ** 2).sum(dim=1, keepdim=True) / ns_cast
    support = (norms >= supp_thresh).expand_as(B)
    support = torch.cat([s_j[:n_j] for s_j, n_j in zip(support, ns)])
    support = torch.nonzero(support)[:, 0]
    if not support.numel():
        support = None
    return support


def compute_ls_grid(As, y, sns_vec, m, ks, n_ls, l_eps, dtype):
    """Compute l values for each given k, and return a dictionary mapping k
    to a list (in decreasing order) of lambda values.

    Arguments have the same meaning as in gel_paths2. sns_vec is a vector of
    sns_j values, as opposed to the matrix computed in gel_paths2.
    """
    ls_grid = {}
    # The bound is given by max_j{||A_j'@(y - b_0)|| / (m*sqrt{n_j}*k)}
    # where b_0 = 1'@y/m. So most things can be precomputed.
    l_max_b_0 = y.mean()
    l_max_unscaled = max(
        (A_j.t() @ (y - l_max_b_0)).norm(p=2) / (m * sns_j)
        for A_j, sns_j in zip(As, sns_vec)
    )
    for k in ks:
        l_max = l_max_unscaled / k
        if n_ls == 1:
            ls_grid[k] = [l_max]
        else:
            l_min = l_max * l_eps
            ls = torch.logspace(
                math.log10(l_min), math.log10(l_max), steps=n_ls, dtype=dtype
            )
            ls = sorted([l.item() for l in ls], reverse=True)
            ls_grid[k] = ls
    return ls_grid
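
# Illustrative sketch (not part of the original package): computing a small
# lambda grid for two tradeoff values on synthetic data. The helper name and
# all values below are made up for illustration.
def _demo_compute_ls_grid():  # hypothetical, not part of the API
    torch.manual_seed(0)
    m = 100
    ns = [3, 4, 2]
    As = [torch.randn(m, n_j) for n_j in ns]
    As = [A_j - A_j.mean(dim=0) for A_j in As]  # features should be centered
    y = torch.randn(m)
    sns_vec = torch.tensor(ns, dtype=torch.float32).sqrt()
    # 5 l values per k, spanning 3 orders of magnitude below each l_max.
    return compute_ls_grid(
        As, y, sns_vec, m, ks=[0.5, 0.9], n_ls=5, l_eps=1e-3,
        dtype=torch.float32,
    )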

def gel_paths(
    gel_solve,
    gel_solve_kwargs,
    make_A,
    As,
    y,
    l_1s,
    l_2s,
    l_rs,
    summ_fun,
    supp_thresh=1e-6,
    device=torch.device("cpu"),
    verbose=False,
    aux_rel_tol=1e-3,
    dtype=torch.float32,
):
    """Solve group elastic net to find the support, and perform ridge
    regression on it.

    The problem is solved for multiple values of l_1, l_2, and l_r (the ridge
    regularization), and the result for each is summarized using the given
    summary function.

    Arguments:
        gel_solve, make_A: functions from a gel implementation to be used
            internally.
        gel_solve_kwargs: dictionary of keyword arguments to be passed to
            gel_solve.
        As: list of feature matrices (same as in make_A). All features
            should be centered.
        y: vector of predictions.
        l_1s, l_2s, l_rs: lists of values for l_1, l_2, and l_r respectively.
        summ_fun: function to summarize results (same as in ridge_paths).
        supp_thresh: for computing the support, 2-norms below this value are
            considered 0.
        device: torch device.
        verbose: enable/disable verbosity.
        aux_rel_tol: relative tolerance for solving auxiliary problems.
        dtype: torch dtype.

    The function returns a dictionary mapping (l_1, l_2, l_r) values to their
    summaries.
    """
    # We have to loop over l_1s, l_2s, and l_rs. Since multiple ridge
    # regressions can be performed efficiently with ridge_paths, l_rs should
    # be the innermost loop. Further, with l_1^1 > l_1^2 > ..., group elastic
    # net for (l_1^j, l_2) can be solved efficiently by warm starting at the
    # solution for (l_1^{j-1}, l_2). So l_2s should be the outermost loop.

    # First compute various required variables.
    l_1s = sorted(l_1s, reverse=True)
    p = len(As)
    m = As[0].shape[0]
    As = [A_j.to(device, dtype) for A_j in As]
    ns = torch.tensor([A_j.shape[1] for A_j in As])
    B_zeros = torch.zeros(p, ns.max().item(), device=device, dtype=dtype)
    sns = ns.to(device, dtype).sqrt().unsqueeze(1).expand_as(B_zeros)

    # Form the A matrix as needed by gel_solve.
    A = make_A(As, ns, device, dtype)

    # Form X, which combines all the As (for ridge_paths).
    X = torch.cat(As, dim=1)
    X = X.t()  # ridge_paths expects a p x m matrix
    y = y.to(device, dtype)

    if "Cs" in gel_solve_kwargs:
        gel_solve_kwargs["Cs"] = [
            C_j.to(device, dtype) for C_j in gel_solve_kwargs["Cs"]
        ]
    if "Is" in gel_solve_kwargs:
        gel_solve_kwargs["Is"] = [
            I_j.to(device, dtype) for I_j in gel_solve_kwargs["Is"]
        ]
    gel_solve_kwargs_aux = gel_solve_kwargs.copy()
    gel_solve_kwargs_aux["rel_tol"] = aux_rel_tol

    summaries = {}
    for l_2 in l_2s:
        # Reset b_init for each l_2 (copied, since some solvers update B in
        # place).
        b_init = 0.0, B_zeros.clone()

        # Find a good initialization for the first (l_1, l_2) pair.
        if l_2 != 0 and l_1s[0] != 0:
            # Find the k corresponding to l_1, l_2:
            # l_1 = k*l, and l_2 = (1 - k)*l.
            k = 1.0 / (1.0 + l_2 / l_1s[0])
            # Compute the l_max corresponding to k.
            l_aux = compute_ls_grid(
                As, y, sns[:, 0], m, [k], 1, None, dtype
            )[k]
            l_aux = l_aux[0]
            # Solve with k, l_aux (loosely, with the auxiliary tolerance).
            if verbose:
                print(
                    "Solving auxiliary problem to get good initialization",
                    file=sys.stderr,
                )
            b_init = gel_solve(
                A,
                y,
                k * l_aux,
                (1.0 - k) * l_aux,
                ns,
                b_init,
                verbose=verbose,
                **gel_solve_kwargs_aux,
            )
            if verbose:
                print("Done solving auxiliary problem", file=sys.stderr)

        for l_1 in l_1s:
            # Solve group elastic net, initializing at the previous solution.
            b_0, B = gel_solve(
                A, y, l_1, l_2, ns, b_init, verbose=verbose,
                **gel_solve_kwargs
            )
            b_init = b_0, B

            # Find the support.
            support = _find_support(B, ns, supp_thresh)
            support_size = 0 if support is None else len(support)
            if verbose:
                print(
                    "Support size: {}".format(support_size), file=sys.stderr
                )

            # Solve ridge on the support and store the summaries.
            if support is not None:
                X_supp = X if support_size == X.shape[0] else X[support]
            else:
                X_supp = None
            ridge_summaries = ridge_paths(
                X_supp, y, support, l_rs, summ_fun, verbose
            )
            del X_supp
            for l_r, summary in ridge_summaries.items():
                summaries[(l_1, l_2, l_r)] = summary
            if verbose:
                print(file=sys.stderr)

    return summaries
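
# Illustrative sketch (not part of the original package): one way to write a
# summ_fun for gel_paths/ridge_paths, here computing validation MSE. The
# helper name and the conventions for X_val/y_val (X_val stored p x m_val
# like X, with b indexed by the support) are assumptions for illustration.
def _demo_make_summ_fun(X_val, y_val):  # hypothetical, not part of the API
    def summ_fun(support, b):
        # ridge_paths calls summ_fun(None, None) when the support is empty.
        if support is None:
            return (y_val ** 2).mean().item()
        # b holds coefficients for the support rows of the full feature
        # matrix, so index the validation features the same way.
        return ((y_val - X_val[support].t() @ b) ** 2).mean().item()

    return summ_fun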

def gel_paths2(
    gel_solve,
    gel_solve_kwargs,
    make_A,
    As,
    y,
    ks,
    n_ls,
    l_eps,
    l_rs,
    summ_fun,
    supp_thresh=1e-6,
    device=torch.device("cpu"),
    verbose=False,
    ls_grid=None,
    aux_rel_tol=1e-3,
    dtype=torch.float32,
):
    """Solve for paths with a reparametrized group elastic net.

    Arguments:
        gel_solve, make_A: functions from a gel implementation to be used
            internally.
        gel_solve_kwargs: dictionary of keyword arguments to be passed to
            gel_solve.
        As: list of feature matrices (same as in make_A). All features
            should be centered.
        y: vector of predictions.
        ks: list of values for trading off the l1 and l2 regularizations.
        n_ls: number of lambda values.
        l_eps: ratio of the minimum to the maximum lambda value.
        l_rs: list of ridge regularization values.
        summ_fun: function to summarize results (same as in ridge_paths).
        supp_thresh: for computing the support, 2-norms below this value are
            considered 0.
        device: torch device.
        verbose: enable/disable verbosity.
        ls_grid: pre-computed dictionary mapping each k in ks to a list of l
            values (in decreasing order). If this argument is not None, n_ls
            and l_eps are ignored.
        aux_rel_tol: relative tolerance for solving auxiliary problems.
        dtype: torch dtype.

    The regularization terms can be rewritten as

        l*(k*||b_j|| + (1 - k)*||b_j||^2)

    where k controls the tradeoff between the two norms, and l controls the
    overall strength. This function takes a list of tradeoff values through
    ks. For each k, an upper bound can be found on l (at and above which the
    support is empty). Using this upper bound, n_ls l values are computed on
    a log scale such that l_min / l_max = l_eps. Other arguments are the same
    as in gel_paths. The function returns a dictionary mapping (l_1, l_2,
    l_r) values to their summaries.
    """
    # Setup is mostly identical to gel_paths.
    p = len(As)
    m = As[0].shape[0]
    As = [A_j.to(device, dtype) for A_j in As]
    ns = torch.tensor([A_j.shape[1] for A_j in As])
    B_zeros = torch.zeros(p, ns.max().item(), device=device, dtype=dtype)
    sns = ns.to(device, dtype).sqrt().unsqueeze(1).expand_as(B_zeros)
    A = make_A(As, ns, device, dtype)
    X = torch.cat(As, dim=1)
    X = X.t()
    y = y.to(device, dtype)

    if "Cs" in gel_solve_kwargs:
        gel_solve_kwargs["Cs"] = [
            C_j.to(device, dtype) for C_j in gel_solve_kwargs["Cs"]
        ]
    if "Is" in gel_solve_kwargs:
        gel_solve_kwargs["Is"] = [
            I_j.to(device, dtype) for I_j in gel_solve_kwargs["Is"]
        ]

    if ls_grid is None:
        ls_grid = compute_ls_grid(As, y, sns[:, 0], m, ks, n_ls, l_eps, dtype)
        ls_grid_self = None
    else:
        # Compute l values with self data to get good initializations.
        ls_grid_self = compute_ls_grid(
            As, y, sns[:, 0], m, ks, n_ls, l_eps, dtype
        )
    gel_solve_kwargs_aux = gel_solve_kwargs.copy()
    gel_solve_kwargs_aux["rel_tol"] = aux_rel_tol

    summaries = {}
    for k in ks:
        # Reset the initial value for each k (copied, since some solvers
        # update B in place).
        b_init = 0.0, B_zeros.clone()
        ls = ls_grid[k]
        full_support = False

        if ls_grid_self is not None:
            # Solve with self l values to get a better initialization.
            # This _greatly_ speeds up the optimization.
            if verbose:
                print(
                    "Solving auxiliary problems to get good initialization",
                    file=sys.stderr,
                )
            ls_self = ls_grid_self[k]
            for l_self in ls_self:
                if l_self < ls[0]:
                    break
                b_init = gel_solve(
                    A,
                    y,
                    k * l_self,
                    (1.0 - k) * l_self,
                    ns,
                    b_init,
                    verbose=verbose,
                    **gel_solve_kwargs_aux,
                )
            if verbose:
                print("Done solving auxiliary problems", file=sys.stderr)

        ridge_summaries = None
        for l in ls:
            # Convert k, l into l_1, l_2.
            l_1, l_2 = k * l, (1.0 - k) * l
            if full_support:
                # Just copy the previous summaries.
                for l_r, summary in ridge_summaries.items():
                    summaries[(l_1, l_2, l_r)] = summary
                continue

            # The rest is similar to gel_paths.
            b_0, B = gel_solve(
                A, y, l_1, l_2, ns, b_init, verbose=verbose,
                **gel_solve_kwargs
            )
            b_init = b_0, B

            # Find the support.
            support = _find_support(B, ns, supp_thresh)
            support_size = 0 if support is None else len(support)
            full_support = support_size == X.shape[0]
            if verbose:
                print(
                    "Support size: {}".format(support_size), file=sys.stderr
                )

            # Solve ridge on the support and store the summaries.
            if support is not None:
                X_supp = X if full_support else X[support]
            else:
                X_supp = None
            ridge_summaries = ridge_paths(
                X_supp, y, support, l_rs, summ_fun, verbose
            )
            del X_supp
            for l_r, summary in ridge_summaries.items():
                summaries[(l_1, l_2, l_r)] = summary
            if verbose:
                print(file=sys.stderr)

    return summaries
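
# --------------------------------------------------------------------------
# Illustrative sketch (not shipped in the wheel): computing solution paths
# with the reparametrized interface, using the FISTA solver internally. The
# file name, data, and parameter values below are made up for illustration.
# --------------------------------------------------------------------------
# File: examples/demo_gelpaths.py (hypothetical)

import torch

import gel.gelfista as gelfista
from gel.gelpaths import gel_paths2

if __name__ == "__main__":
    torch.manual_seed(0)
    m = 100
    As = [torch.randn(m, n_j) for n_j in (3, 4, 2)]
    As = [A_j - A_j.mean(dim=0) for A_j in As]  # center the features
    y = torch.randn(m)
    summaries = gel_paths2(
        gelfista.gel_solve,
        {"ls_beta": 0.9, "max_iters": 1000},
        gelfista.make_A,
        As,
        y,
        ks=[0.5],
        n_ls=4,
        l_eps=1e-2,
        l_rs=[0.1, 1.0],
        # Toy summary: just record the support size.
        summ_fun=lambda support, b: 0 if support is None else len(support),
    )
    for key, summary in summaries.items():
        print(key, summary)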

# File: gel/ridgepaths.py

"""ridgepaths.py: solution paths for ridge regression."""

import torch
import tqdm


def ridge_paths(X, y, support, lambdas, summ_fun, verbose=False):
    """Solve ridge regression for a sequence of regularization values, and
    return the summary for each.

    The multiple solutions are obtained efficiently using the Woodbury
    identity. With X (p x m) representing the feature matrix, and y (m x 1)
    the outcomes, the ridge solution is given by

        b = (X@X' + l*I)^{-1}@X@y

    where l is the regularization coefficient. This can be reduced to

        (1/l)*(X@y - X@V@(e + l*I)^{-1}@(X@V)'@X@y)

    where V@e@V' is the eigendecomposition of X'@X. Since (e + l*I) is a
    diagonal matrix, it can be inverted efficiently simply by taking the
    reciprocal of the diagonal elements. Then, since (X@V)'@X@y is a vector,
    it can be multiplied by (e + l*I)^{-1} just by scalar multiplication.

    Arguments:
        X: p x m tensor of features (where m is the number of samples); X
            should be centered (each row should have mean 0).
        y: tensor (length m vector) of outcomes.
        support: LongTensor vector of indices of the features to use. It can
            also be None to indicate an empty support. X should already be
            indexed using support; this argument is simply passed on to the
            summary function.
        lambdas: list of regularization values for which to solve the
            problem.
        summ_fun: a function that takes (support, b) and returns an
            arbitrary summary.
        verbose: enable/disable the progress bar.

    The function returns a dictionary mapping lambda values to their
    summaries.
    """
    summaries = {}
    if support is None:
        # Nothing to do.
        for l in tqdm.tqdm(
            lambdas, desc="Solving ridge regressions", disable=not verbose
        ):
            summaries[l] = summ_fun(None, None)
        return summaries

    # Setup.
    _, S, V = torch.svd(X)
    e = S ** 2
    p = X @ y
    Q = X @ V
    r = Q.t() @ p  # (X@V)'@X@y

    # Main loop.
    for l in tqdm.tqdm(
        lambdas, desc="Solving ridge regressions", disable=not verbose
    ):
        b = (1.0 / l) * (p - Q @ (r / (e + l)))
        summaries[l] = summ_fun(support, b)

    return summaries


# File: torchgel-1.0.0.dist-info/LICENSE

MIT License

Copyright (c) 2017 Jayanth Koushik

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
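
# --------------------------------------------------------------------------
# Illustrative sketch (not shipped in the wheel): solving ridge regression
# for several regularization values with a single SVD via ridge_paths. The
# file name, data, and parameter values below are made up for illustration.
# --------------------------------------------------------------------------
# File: examples/demo_ridgepaths.py (hypothetical)

import torch

from gel.ridgepaths import ridge_paths

if __name__ == "__main__":
    torch.manual_seed(0)
    p, m = 10, 50
    X = torch.randn(p, m)
    X = X - X.mean(dim=1, keepdim=True)  # center each feature (row)
    y = torch.randn(m)
    support = torch.arange(p)  # use all features
    summaries = ridge_paths(
        X, y, support, [0.1, 1.0, 10.0],
        summ_fun=lambda support, b: b.norm().item(),
    )
    for l, summary in summaries.items():
        print(l, summary)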