M3C 2017 Homework 2 solution

The complete Python script, hw2soln.py:

"""Homework 2 Solution"""

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from hw2mod0 import cost_soln as cost #assumes the cost_soln and hw2soln Fortran modules have been compiled with f2py into hw2mod0
from hw2mod0 import hw2soln as hw2
from time import time
import matplotlib.colors as colors
from mpl_toolkits.mplot3d import Axes3D

def visualize(Nx,Ny,xlims=(-2,2),ylims=(-1,3),type='contour'):
    """Visualize cost function defined in cost.costj
        contour plot is generated when type='contour', otherwise
        a surface plot is generated
    """

    #generate grid, initialize cost
    x = np.linspace(xlims[0],xlims[1],Nx)
    y = np.linspace(ylims[0],ylims[1],Ny)
    c = np.zeros((Ny,Nx))

    #compute cost, J(x,y) (better to use a vectorized version of cost.costj that takes array input)
    cost.c_noise = False
    for i,xi in enumerate(x):
        for j,yj in  enumerate(y):
            c[j,i] = cost.costj([xi,yj])
    c = c + 1e-16 #to avoid zero values when taking log below
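    #A vectorized alternative (sketch, not used): if cost.costj is the
    #Rosenbrock-type cost implied by cost_grad in performance() below,
    #J = (x-1)**2 + 20*(y-x**2)**2, the double loop could be replaced by:
    #  xg_,yg_ = np.meshgrid(x,y)
    #  c = (xg_-1)**2 + 20*(yg_-xg_**2)**2 + 1e-16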

    #repeat above with noise added
    cost.c_noise = True
    cost.c_noise_amp = 1.0
    c2 = np.zeros((Ny,Nx))
    for i,xi in enumerate(x):
        for j,yj in  enumerate(y):
            c2[j,i] = cost.costj([xi,yj])

    #Choose contour levels for noisy data
    c2range = c2.max()-c2.min()
    levels = np.logspace(-3,np.log10(c2range),50)+c2.min()

    if type == 'contour':
        #2D contour plots
        plt.figure()
        C = plt.contour(x,y,np.log10(c),50)
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('log10(J(x,y))')
        plt.colorbar(C)


        plt.figure()
        C2 = plt.contour(x,y,c2,levels,norm=colors.SymLogNorm(2))
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('J(x,y) with noise (log-spaced contour levels)')
        bar = plt.colorbar(C2,format='%.2E')

    else:
        #3D surface plots
        fig = plt.figure()
        ax = Axes3D(fig)
        xg,yg = np.meshgrid(x,y)
        surf = ax.plot_surface(xg, yg,np.log10(c),cmap='jet',ccount=200)
        plt.colorbar(surf,shrink=0.5)
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('log10(J(x,y))')
        ax.view_init(60,-60)

        #3D surface plots
        fig2 = plt.figure()
        ax2 = Axes3D(fig2)
        surf2 = ax2.plot_surface(xg, yg,np.log10(c2-c2.min()),cmap='jet',ccount=200)
        plt.colorbar(surf2,shrink=0.5)
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('log10(J(x,y)-Jmin) with noise')
        ax2.view_init(60,-60)

        if type == 'surf':
            return xg,yg,c2,ax2,surf2

def newton_test(xg,display=False):
    """ Use Newton's method to minimize cost function defined in cost module
    Input variable xg is initial guess for location of minimum. When display
    is true, a figure illustrating the convergence of the method should be
    generated

    Output variables: xf -- computed location of minimum, jf -- computed minimum,
    npath -- number of iterates stored in the module variable hw2.xpath.
    """
    xf,jf = hw2.newton(xg)
    npath = hw2.xpath.shape[0]

    if display:
        plt.figure()
        plt.subplot(211); plt.plot(hw2.xpath[-1,0],hw2.xpath[-1,1],'r*')
        plt.plot(hw2.xpath[:,0],hw2.xpath[:,1],'x--')
        plt.legend(('solution',))
        plt.xlabel('x');plt.ylabel('y');plt.title('X-Y location and cost during Newton iterations, xg=%2.2f,%2.2f'%(xg[0],xg[1]))
        plt.subplot(212); plt.semilogy(hw2.jpath+1e-16,'x--')
        plt.xlabel('iteration');plt.ylabel('Cost');plt.xlim(0,npath)

    return xf,jf,npath


def bracket_descent_test(xg,display=False):
    """ Use bracket-descent to minimize cost function defined in cost module
    Input variable xg is initial guess for location of minimum. When display
    is true, 1-2 figures comparing the B-D and Newton steps should be generated

    Output variables: xf -- computed location of minimum, jf -- computed minimum,
    h_n, h_bd -- Newton and bracket-descent steps computed along the B-D path
    """
    """ From question 2, we know that Newton's method converges extremely
    quickly for this cost function for a variety of guesses, so comparing
    the b-d and Newton steps allows us to evaluate the quality of the b-d
    algorithm. The details of the comparison will depend on the initial guess
    but broadly there is an initial adjustment period where b-d steps are smaller
    and in a different direction from Newton steps. Subsequently, the angles
    tend to align, and amplitudes reduce at similar rates. Close to the minimum,
    the step-size decreases exponentially. Occasionally, the b-d method moves
    in the 'wrong' direction suggesting a deficiency in the algorithm.
    """

    xf,jf = hw2.bracket_descent(xg)

    nsteps = hw2.xpath.shape[0]
    h_n = np.zeros((nsteps-1,2))
    h_bd = h_n.copy()

    for i,x in enumerate(hw2.xpath[:-1,:]):
        #compute newton step
        cost.c_x = x
        g = cost.costj_grad2d(x)
        H = cost.costj_hess2d(x)
        h_n[i,:] = -np.linalg.solve(H,g)
        h_bd[i,:] = hw2.xpath[i+1,:]-hw2.xpath[i,:]

    #step amplitudes and angles (vectorized, so computed once outside the loop)
    A_n = np.sqrt(h_n[:,0]**2 + h_n[:,1]**2)
    A_bd = np.sqrt(h_bd[:,0]**2 + h_bd[:,1]**2)

    theta_n = np.arctan(h_n[:,1]/h_n[:,0])*180/np.pi
    theta_bd = np.arctan(h_bd[:,1]/h_bd[:,0])*180/np.pi

    if display:
        plt.figure()
        plt.semilogy(A_n,'x--')
        plt.semilogy(A_bd,'o--')
        plt.legend(('Newton','B-D'))
        plt.title('Amplitude of Newton and B-D steps, xg=%2.2f,%2.2f'%(xg[0],xg[1]))
        plt.xlabel('iteration')
        plt.ylabel('|h|')

        plt.figure()
        plt.plot(theta_n,'x--')
        plt.plot(theta_bd,'o--',markersize=4)
        plt.legend(('Newton','B-D'))
        plt.title('Angle of Newton and B-D steps, xg=%2.2f,%2.2f'%(xg[0],xg[1]))
        plt.xlabel('iteration')
        plt.ylabel('$tan^{-1} (h_y/h_x)$')
        plt.show()
    return xf,jf,h_n,h_bd
#--------------------------

def performance():
    """Compare b-d and scipy l-bfgs-b software on cost function with
    and without added noise
    no noise: The b-d method requires a relatively large number of iterations,
    but it is significantly faster than scipy's l-bfgs-b. This points to the
    importance of optimizations carried out by the Fortran compiler, as well as
    the relative complexity of the bfgs algorithm, which requires matrix-vector
    multiplication and iterative updates of the approximate inverse Hessian.
    The b-d scheme was modified so that iterations terminate when the cost
    is within tol of the cost computed by minimize.

    noise: This is very complicated. For noise levels less than about 1e-4
    we see results very similar to what is observed for noise-free calculations.
    For noise levels ~O(1), neither method is reliable. The b-d method
    "converges" to a local minimum which can be larger than the value produced
    by minimize even when minimize doesn't converge. For intermediate noise
    amplitudes, there is strong sensitivity to the guess, with minimize tending
    not to converge as the guess moves away from (1,1).
    B-D again typically does converge, but the estimated minimum is not reliable.
    The original convergence method is used here rather than the bespoke method
    used for the no-noise case.
    Not required: a simple fix to improve b-d is to increase the size of the
    initial triangle as the noise amplitude increases.


    """

    #Initialize variables
    x0values = [-100,-50,-10,-1]
    min_tol = 1.0e-6 #set tol for minimize to avoid precision issues from costs being 1e-16

    result_min1=[]
    result_min2=[]

    xf = np.zeros((4,2))
    jf = np.zeros(4)
    niter_bd = np.zeros(4)
    niter_min1 = np.zeros(4)
    niter_min2 = np.zeros(4)
    t_bd = np.zeros(4)
    t_min1 = np.zeros(4)
    t_min2 = np.zeros(4)

    def cost_grad(x):
        """alternative to cost.costj_grad2d (not used)"""
        g = np.zeros_like(x)
        g[0] = 2*(x[0]-1) - 4*20*x[0]*(x[1] - x[0]**2)
        g[1] = 2*20*(x[1] - x[0]**2)
        return g

    #no noise-------------
    cost.c_noise = False
    hw2.convergence_method = 2
    #gather niter, results
    for i,x in enumerate(x0values):
        xg = [x,-3]
        result_min = minimize(cost.costj,xg,method='L-BFGS-B',jac=False,tol=min_tol)
        niter_min1[i] = result_min.nit
        result_min1.append(result_min)
        result_min = minimize(cost.costj,xg,method='L-BFGS-B',jac=cost.costj_grad2d,tol=min_tol)
        niter_min2[i] = result_min.nit
        result_min2.append(result_min)
        hw2.tol  = max(1e-15,result_min.fun)
        xf[i],jf[i] = hw2.bracket_descent(xg)
        niter_bd[i] = hw2.xpath.shape[0]

    plt.figure()
    plt.plot(x0values,niter_bd,'x--')
    plt.plot(x0values,niter_min1,'.--')
    plt.plot(x0values,niter_min2,'+--')
    plt.xlabel('initial $x_1$')
    plt.ylabel('number of iterations')
    plt.grid()
    plt.xlim(-100,0)
    plt.legend(('B-D','L-BFGS-B, approx grad','L-BFGS-B, exact grad'))

    loops = 50
    #gather timing info
    for i,x in enumerate(x0values):
        xg = [x,-3]
        t1 = time()
        for j in range(loops):
            out = minimize(cost.costj,xg,method='L-BFGS-B',jac=False,tol=min_tol)
        t2 = time()
        t_min1[i] = (t2-t1)/loops

        t1 = time()
        for j in range(loops):
            out = minimize(cost.costj,xg,method='L-BFGS-B',jac=cost.costj_grad2d,tol=min_tol)
        t2 = time()
        t_min2[i]=(t2-t1)/loops

        hw2.tol  = max(1e-15,out.fun)
        t1 = time()
        for j in range(loops):
            out = hw2.bracket_descent(xg)
        t2=time()
        t_bd[i] = (t2-t1)/loops


    plt.figure()
    plt.semilogy(x0values,t_bd,'x--',label='B-D')
    plt.semilogy(x0values,t_min1,'.--',label='L-BFGS-B, approx grad')
    plt.semilogy(x0values,t_min2,'+--',label='L-BFGS-B, exact grad')
    plt.legend()
    plt.xlim(-100,0)
    plt.xlabel('initial $x_1$')
    plt.ylabel('walltime (s)')

    #--- added noise------------------
    noise_values = np.logspace(-3,0,7)
    x0values = [-1,-8]
    ng = len(x0values)
    cost.c_noise = True
    result_min_noise=[]
    xn_bd = np.zeros((7,2));jn_bd = np.zeros((7,ng))
    jn_min = jn_bd.copy()
    niter_bd = np.zeros((7,ng))
    niter_min = np.zeros((7,ng))
    flag = np.zeros((7,ng))
    hw2.tol = 1e-12
    hw2.convergence_method = 1

    for i,noise in enumerate(noise_values):
        cost.c_noise_amp = noise
        for j,x in enumerate(x0values):
            xg = [x,-3]
            result_min = minimize(cost.costj,xg,method='L-BFGS-B',jac=False,tol=min_tol)
            niter_min[i,j] = result_min.nit
            jn_min[i,j] = result_min.fun
            result_min_noise.append(result_min)
            if result_min.success:
                flag[i,j] = 1
            xn_bd,jn_bd[i,j] = hw2.bracket_descent(xg)
            niter_bd[i,j] = hw2.xpath.shape[0]
            print(xn_bd,jn_bd[i,j],niter_bd[i,j],result_min.x,result_min.fun,flag[i,j])

    # iterations plot
    plt.figure()
    for  j,x in enumerate(x0values):
        plt.semilogx(noise_values,niter_bd[:,j],'x--',label='b-d, xg=%2.1f,-3' %(x))
        plt.semilogx(noise_values,niter_min[:,j],'.--',label='l-bfgs-b, xg=%2.1f,-3' %(x))

    count=0
    for i,n in enumerate(noise_values):
        for j,x in enumerate(x0values):
            if flag[i,j]==0:
                if count==0:
                    plt.semilogx(n,niter_min[i,j],'go',label='did not converge')
                    count+=1
                else:
                    plt.semilogx(n,niter_min[i,j],'go')

    plt.xlabel('noise amplitude')
    plt.ylabel('iterations')
    plt.title('Influence of added noise on convergence of optimizers')
    plt.legend(loc='best')


    #cost plot
    plt.figure()
    for  j,x in enumerate(x0values):
        plt.semilogx(noise_values,jn_bd[:,j],'x--',label='b-d, xg=%2.1f,-3' %(x))
        plt.semilogx(noise_values,jn_min[:,j],'.--',label='l-bfgs-b, xg=%2.1f,-3' %(x))

    count=0
    for i,n in enumerate(noise_values):
        for j,x in enumerate(x0values):
            if flag[i,j]==0:
                    if count==0:
                        plt.semilogx(n,jn_min[i,j],'go',label='did not converge')
                        count+=1
                    else:
                        plt.semilogx(n,jn_min[i,j],'go')

    plt.xlabel('noise amplitude')
    plt.ylabel('Estimated minimum cost')
    plt.title('Influence of added noise on estimated minimum values')
    plt.legend(loc='best')
    plt.show()
    """
    plt.xlabel('noise amplitude')
    plt.ylabel('iterations')
    plt.title('Influence of added noise on convergence of optimizers')
    plt.legend(('B-D','L-BFSG-B, approx grad','did not converge'))
    """

    return noise_values,niter_min,niter_bd




if __name__ == '__main__':
    run_visualize,run_newton,run_bd,run_performance = False,True,False,False

    if run_visualize:
        #Generate figures showing cost function
        visualize(300,200,xlims=(-4,6),ylims=(-6,6))
        outs = visualize(1200,1000,type='surf')

    if run_newton:
        xgarray = np.array([[2,2],[20,2],[200,2],[2000,2]])
        niter = []
        for xg in xgarray:
            _,_,temp = newton_test(xg)
            niter = niter + [temp]
        plt.figure()
        plt.semilogx(xgarray[:,0],niter,'x')
        plt.xlabel('$x_{1,g}$')
        plt.ylabel('Iterations for convergence')
        plt.title('Number of iterations required by Newton method, $x_{2,g}=2$')
        plt.show()
    if run_bd:
        bracket_descent_test([-1,-3])

    #Add code here to call performance
    if run_performance:
        performance()

and the Fortran module, hw2soln.f90:

!Module for solving n-d optimization problems with Newton's method and 2-d problems
!with bracket descent. Necessary cost function details are provided in separate cost
!module.
module hw2soln
  use cost_soln
  implicit none
  integer :: itermax = 1000 !maximum number of iterations used by an optimizer
  real(kind=8) :: tol=1.0d-6 !stopping criterion for optimizer
  real(kind=8), allocatable :: jpath(:), xpath(:,:) !allocatable 1-d and 2-d arrays, should contain cost and location values used during optimization iterations
  integer :: convergence_method = 1
  contains

  subroutine newton(xguess,xf,jf)
    !Use Newton's method to minimize cost function, costj
    !input: xguess -- initial guess for location of minimum
    !output: xf -- computed location of minimum, jf -- computed minimum
    !Should also set module variables xpath and jpath appropriately
    implicit none
    integer :: ndim
    real(kind=8), dimension(:), intent(in) :: xguess !do not need to explicitly specify dimension of input variable when subroutine is within a module
    real(kind=8), intent(out) :: xf(size(xguess)),jf !location of minimum, minimum cost
    logical :: convergence_flag
    integer :: i1
    !dsysv variables
    integer :: NRHS,LDA,LDB,INFO,lwork
    real(kind=8) :: j_old
    real(kind=8), allocatable, dimension(:) :: work,jtemp
    real(kind=8), allocatable, dimension(:,:) :: xtemp,HH,g
    integer, allocatable, dimension(:) :: IPIV

    ndim = size(xguess)
    allocate(xtemp(itermax+1,ndim),jtemp(itermax+1))

    !setup dsysv variables
    NRHS = 1
    LDA = ndim
    LDB = ndim
    !allocate arrays
    lwork = ndim**2
    allocate(HH(LDA,ndim),g(LDB,NRHS),IPIV(ndim),work(lwork))

    xf = xguess
    call costj(xf,jf)
    i1=0
    xtemp(i1+1,:) = xf
    jtemp(i1+1) = jf

    convergence_flag = .False.

    !iterate using Newton's method
    do while(i1<itermax .and. (.not. convergence_flag))

      !compute Hessian, gradient
      call costj_grad2d(xf,g)
      call costj_hess2d(xf,HH)

      !compute step, update coordinate
      call dsysv('U',ndim, NRHS, HH, LDA, IPIV, g, LDB, work,lwork,INFO)
      xf = xf - g(1:ndim,1)
      i1 = i1 + 1
      xtemp(i1+1,:) = xf
      j_old = jf
      call costj(xf,jf)
      jtemp(i1+1) = jf
      call convergence_check(j_old,jf,convergence_flag)
    end do

    !set xpath jpath
    if (allocated(xpath)) deallocate(xpath)
    if (allocated(jpath)) deallocate(jpath)

    allocate(xpath(i1+1,ndim),jpath(i1+1))
    xpath = xtemp(1:i1+1,:)
    jpath = jtemp(1:i1+1)

  end subroutine newton


  subroutine bracket_descent(xguess,xf,jf)
    !Use bracket descent method to minimize cost function, costj
    !input: xguess -- initial guess for location of minimum
    !output: xf -- computed location of minimum, jf -- computed minimum
    !Should also set module variables xpath and jpath appropriately
    !Assumes size(xguess) = 2
    implicit none
    real(kind=8), dimension(2), intent(in) :: xguess
    real(kind=8), intent(out) :: xf(2),jf !location of minimum, minimum cost
    integer :: i1,k1
    real(kind=8) :: j3(3),x3(3,2),j3temp(3),x3temp(3,2),xtemp(itermax+1,2),jtemp(itermax+1)
    real(kind=8) :: xm(2),dx(2),xr(2),jr,j_old
    logical :: convergence_flag

    !initialize triangle, costs
    call bd_initialize(xguess,x3,j3)
    call bd_sort(x3,j3,x3temp,j3temp)
    x3 = x3temp
    j3 = j3temp
    i1=0
    xtemp(i1+1,:) = sum(x3,dim=1)/3
    call costj(xtemp(i1+1,:),j_old)
    jtemp(i1+1) = j_old
    j_old = sum(abs(j3))
    convergence_flag = .False.


    !Iterate using b-d scheme
    do while(i1<itermax .and. (.not. convergence_flag))
      !reflect
      xm = 0.5d0*(x3(1,:)+x3(2,:))
      dx = x3(3,:)-xm
      xf = xm-dx

      call costj(xf,jf)

      if (jf<j3(3)) then
        if (jf>j3(1)) then !accept new vertex
            j3(3) = jf
            x3(3,:) = xf
        else !extend

          xr = xm-2*dx
          call costj(xr,jr)
          if (jr<jf) then !accept extension
            j3(3) = jr
            x3(3,:) = xr
          else !accept original reflection
            j3(3) = jf
            x3(3,:) = xf
          end if
        end if
      else
        xf = xm - dx/2 !contract
        call costj(xf,jf)
        if (jf<j3(3)) then !accept contraction
          j3(3) = jf
          x3(3,:) = xf
        else !contract further
          xf = xm + dx/2
          call costj(xf,jf)
          if (jf<j3(3)) then !accept 2nd contraction
            j3(3) = jf
            x3(3,:) = xf
          else !shrink
            do k1=2,3
              x3(k1,:) = 0.5d0*(x3(1,:)+x3(k1,:))
              xf = x3(k1,:)
              call costj(xf,jf)
              j3(k1) = jf
            end do
          end if
        end if
      end if

      !compute centroid, sort vertices, check for convergence
      i1 = i1 + 1
      xtemp(i1+1,:) = sum(x3,dim=1)/3
      xf = xtemp(i1+1,:)
      call costj(xf,jf)
      jtemp(i1+1) = jf
      call bd_sort(x3,j3,x3temp,j3temp)
      x3 = x3temp
      j3 = j3temp
      jf = sum(abs(j3))
      if (convergence_method==1) then
        call convergence_check(j_old,jf,convergence_flag)
      else
        if (minval(j3)<=tol) convergence_flag = .True.
      end if
      j_old = jf
    end do

    !set xpath, jpath
    if (allocated(xpath)) deallocate(xpath)
    if (allocated(jpath)) deallocate(jpath)
    allocate(xpath(i1,2),jpath(i1))
    xpath = xtemp(1:i1,:)
    jpath = jtemp(1:i1)
    xf = x3(1,:)
    jf = j3(1)


  end subroutine bracket_descent


  subroutine bd_initialize(xguess,x3,j3)
    !given xguess, generates vertices (x3) and corresponding costs (j3) for initial
    !bracket descent step
    implicit none
    real(kind=8), intent(in) :: xguess(2)
    real(kind=8), intent(out) :: j3(3),x3(3,2) !location of minimum
    integer :: i1
    real(kind=8), parameter :: l=1.d0

    x3(1,1) = xguess(1)
    x3(2,1) = xguess(1)+l*sqrt(3.d0)/2
    x3(3,1) = xguess(1)-l*sqrt(3.d0)/2
    x3(1,2) = xguess(2)+l
    x3(2,2) = xguess(2)-l/2
    x3(3,2) = xguess(2)-l/2
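    !the three vertices above form an equilateral triangle of side sqrt(3)*l centered on xguess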

    do i1=1,3
      call costj(x3(i1,:),j3(i1))
    end do
  end subroutine bd_initialize

  subroutine bd_sort(x,c,xs,cs)
    !Three element insertion sort
    implicit none
    real(kind=8), intent(in) :: c(3),x(3,2)
    real(kind=8), intent(out) :: cs(3),xs(3,2)
    real(kind=8), dimension(2) :: temp

    cs = c
    xs = x
    if (c(2)<c(1)) then
      cs(1) = c(2)
      cs(2) = c(1)
      xs(1,:) = x(2,:)
      xs(2,:) = x(1,:)
    end if

    if (c(3)<cs(2)) then
      cs(3) = cs(2)
      cs(2) = c(3)
      xs(3,:) = xs(2,:)
      xs(2,:) = x(3,:)

      if (c(3)<cs(1)) then
        temp(1) = cs(1)
        cs(1) = cs(2)
        cs(2) = temp(1)
        temp = xs(1,:)
        xs(1,:) = xs(2,:)
        xs(2,:) = temp
      end if
    end if

  end subroutine bd_sort


  subroutine convergence_check(j1,j2,flag_converged)
    !check if costs j1 and j2 satisfy convergence criteria
    implicit none
    real(kind=8), intent(in) :: j1,j2
    real(kind=8) :: test
    logical, intent(out) :: flag_converged

    test = abs(j1-j2)/max(abs(j1),abs(j2),1.d0)
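    !relative change in cost; the max(...,1.d0) guards against dividing by a near-zero cost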
    if (test .le. tol) then
      flag_converged = .True.
    else
      flag_converged = .False.
    end if
  end subroutine convergence_check



end module hw2soln
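
A minimal build-and-use sketch is given below (the f2py invocation and source filenames are assumptions that depend on the local setup; the module name hw2mod0 matches the import in hw2soln.py, and LAPACK must be linked for dsysv):

# shell: f2py -c cost_soln.f90 hw2soln.f90 -m hw2mod0 -llapack
from hw2mod0 import hw2soln as hw2, cost_soln as cost

cost.c_noise = False
xf, jf = hw2.newton([10.0, -5.0])   # hw2.xpath and hw2.jpath record the iterates
print(xf, jf, hw2.xpath.shape[0])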

1. (10 pts) The cost function is shown with and without added noise in the figures below. The sharp gradients and large range of amplitudes suggest that a logarithmic scale should be used:

[Figures: ../_images/hw212.png, ../_images/hw211.png, ../_images/hw212s.png, ../_images/hw211s.png]
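For reference, the gradient hard-coded in cost_grad within hw2soln.py is consistent (up to the optional additive noise handled by the cost module) with a Rosenbrock-type cost,

J(x,y) = (x-1)^2 + 20\,(y-x^2)^2, \qquad
\nabla J = \begin{pmatrix} 2(x-1) - 80\,x\,(y-x^2) \\ 40\,(y-x^2) \end{pmatrix},

which vanishes at its minimum (x,y) = (1,1).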
  2. Newton’s method (25 pts)

i) See solution code

ii) The trajectory of the estimate for the minimizer and the reduction in the cost function are shown in the figures below. Newton’s method is extremely effective for this cost function (even with guesses very far from the solution), indicating that a locally quadratic approximation of the cost function is highly accurate.

[Figures: ../_images/hw221soln.png, ../_images/hw222soln.png]
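As a sanity check, a single Newton step for this cost can be written directly in NumPy (a sketch assuming the Rosenbrock-type cost above; the Fortran routine instead solves H*h = grad(J) with LAPACK's dsysv and updates x <- x - h):

import numpy as np

def newton_step(x):
    """One Newton step for J(x,y) = (x-1)**2 + 20*(y-x**2)**2 (sketch)."""
    g = np.array([2*(x[0]-1) - 80*x[0]*(x[1]-x[0]**2),
                  40*(x[1]-x[0]**2)])
    H = np.array([[2 - 80*(x[1]-3*x[0]**2), -80*x[0]],
                  [-80*x[0], 40.0]])
    return x - np.linalg.solve(H, g)

x = np.array([2.0, 2.0])
for it in range(20):
    xnew = newton_step(x)
    if np.allclose(xnew, x):
        break
    x = xnew
print(x, it)   # should approach the minimum at (1,1) in a handful of iterations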
  3. Bracket-descent optimization (40 pts)

i) See solution code

ii) From question 2, we know that Newton’s method converges extremely quickly for this cost function for a variety of guesses, so comparing the b-d and Newton steps allows us to evaluate the quality of the b-d algorithm. The details of the comparison will depend on the initial guess but broadly, as illustrated in the figures below, there is an initial adjustment period where b-d steps are smaller and in a different direction from Newton steps. Subsequently, the angles tend to align, and the amplitudes reduce at similar rates. Close to the minimum, the step size decreases exponentially. Occasionally, the b-d method moves in the ‘wrong’ direction, suggesting a deficiency in the algorithm.
[Figures: ../_images/hw231s.png, ../_images/hw232s.png]
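The step amplitudes and angles plotted above are computed in bracket_descent_test; an equivalent vectorized sketch is shown below (it uses np.arctan2 rather than the np.arctan of the solution code, so the quadrant of each step is preserved):

import numpy as np

def step_stats(xpath):
    """Amplitude and direction (degrees) of successive steps in an (N,2) iterate path."""
    h = np.diff(xpath, axis=0)                      # h_i = x_{i+1} - x_i
    amp = np.hypot(h[:, 0], h[:, 1])                # |h_i|
    ang = np.degrees(np.arctan2(h[:, 1], h[:, 0]))  # direction of each step
    return amp, ang

# e.g. after a call to hw2.bracket_descent(xg):  amp_bd, ang_bd = step_stats(hw2.xpath)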
  4. Comparing bracket descent and L-BFGS-B (25 pts)

i) (10 pts) As seen in the first figure below, B-D generally requires more iterations than L-BFGS-B, and specifying the exact gradient of the cost function does accelerate convergence, as expected. However, the B-D method is much faster (second figure) as it does not require the gradient, estimation of the inverse Hessian, or the matrix-vector product of the two. There is some uncertainty here as we do not know the quality of the scipy implementation; however, we can speculate that optimizations carried out by the Fortran compiler are also contributing to these trends.

[Figures: ../_images/hw241.png, ../_images/hw242.png]
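The timing loops in performance() can equivalently be written with timeit (a sketch assuming the compiled hw2mod0 module is importable):

import timeit
from scipy.optimize import minimize
from hw2mod0 import cost_soln as cost, hw2soln as hw2

xg = [-10, -3]
cost.c_noise = False
t_lbfgs = timeit.timeit(lambda: minimize(cost.costj, xg, method='L-BFGS-B',
                                         jac=cost.costj_grad2d, tol=1e-6), number=50)/50
t_bd = timeit.timeit(lambda: hw2.bracket_descent(xg), number=50)/50
print(t_lbfgs, t_bd)   # b-d is consistently the faster of the two here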
ii) With added noise, the trends are very complicated. For noise levels less than about 1e-4 we see results very similar to what is observed for noise-free calculations. For noise levels ~O(1), neither method is reliable. The b-d method “converges” to a local minimum which can be larger than the value produced by minimize, even when minimize doesn’t converge. For intermediate noise amplitudes, there is strong sensitivity to the guess, with minimize tending not to converge as the guess moves away from (1,1). B-D again typically does converge, but the estimated minimum is not reliable. The original convergence method is used here rather than the bespoke method used for the no-noise case. Not required: a simple fix to improve b-d is to increase the size of the initial triangle as the noise amplitude increases.
[Figure: ../_images/hw243s.png]

iii) Possible added features include: input checking with assert statements, proper test functions for the new optimizers suitable for unit testing, and an option to output optimizer iterations and progress to the screen.