[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Faster subsassgn (Was: Five other functions that are in Matlab core port
From: |
David Bateman |
Subject: |
Faster subsassgn (Was: Five other functions that are in Matlab core ported to Octave) |
Date: |
Sat, 08 Sep 2007 00:23:18 +0200 |
User-agent: |
Thunderbird 1.5.0.7 (X11/20060921) |
There was an error in my previous test script as "A([1:n],i)" is an
entirely different case than "A(1:n,i)". In the first case all of the
element f the matrix used to index A must be check that they are valid,
and in the second case the fact that a Range is used can accelerate the
process. I attach an updated script that compares indexing like "A(:,i)"
against "A(1:n,i)". With the patch I sent previously this gives
octave:1> [t1,t2] = testsubsassgn([100,200,500,1000],[100,200,500,1000],5)
t1 =
3.9980e-04 8.0000e-04 1.7996e-03 3.9994e-03
4.0000e-04 9.9980e-04 2.1996e-03 4.5994e-03
6.0000e-04 1.1998e-03 3.3994e-03 6.1992e-03
1.0000e-03 1.9998e-03 4.5994e-03 9.3986e-03
t2 =
6.0000e-04 9.9980e-04 2.1996e-03 4.3994e-03
8.0000e-04 9.9980e-04 2.7996e-03 5.3992e-03
1.1998e-03 1.5998e-03 3.7994e-03 7.7988e-03
1.1998e-03 2.5996e-03 6.1992e-03 1.1998e-02
without the patch and
octave:1> [t1,t2] = testsubsassgn([100,200,500,1000],[100,200,500,1000],5)
t1 =
3.9980e-04 5.9980e-04 1.5998e-03 3.3994e-03
3.9980e-04 7.9980e-04 1.5998e-03 3.7992e-03
4.0000e-04 7.9980e-04 2.1998e-03 4.5994e-03
3.9980e-04 1.3998e-03 2.7996e-03 6.1990e-03
t2 =
6.0000e-04 8.0000e-04 1.9996e-03 4.1994e-03
8.0000e-04 1.1998e-03 2.5996e-03 5.1992e-03
4.0000e-04 1.5998e-03 3.3994e-03 6.5990e-03
1.1998e-03 1.9996e-03 4.5992e-03 9.3986e-03
with it.. A bit of further profiling shows that the second case is
spending much longer than needed in the
idx_vector::idx_vector_rep::init_state method. Each time "A(1:n,i)" is
called, the max/min elements of 1:n and the number of ones and zeros are
calculated. As we are dealing with a Range this can be accelerated
significantly without the need to check all of the values. Making such a
change gives the result
octave:1> [t1,t2] = testsubsassgn([100,200,500,1000],[100,200,500,1000],5)
t1 =
4.0000e-04 5.9980e-04 1.5998e-03 3.3994e-03
3.9980e-04 5.9980e-04 1.7998e-03 3.7994e-03
5.9980e-04 9.9980e-04 2.1996e-03 4.5992e-03
4.0000e-04 1.1998e-03 2.9994e-03 5.9992e-03
t2 =
4.0000e-04 8.0000e-04 1.5998e-03 3.3996e-03
4.0000e-04 7.9980e-04 1.9996e-03 3.9994e-03
6.0000e-04 7.9980e-04 2.1996e-03 4.1994e-03
7.9980e-04 9.9980e-04 2.3996e-03 4.5994e-03
or almost a further doubling in the speed of "A(1:n,i)" indexing!!! I
also ran "make check" with the attached patch with no issue. Given the
speed-ups it offers and the minimum intrusiveness it seems like a
reasonable patch to apply.
D.
Index: liboctave/Array.cc
===================================================================
RCS file: /usr/local/cvsroot/octave/liboctave/Array.cc,v
retrieving revision 1.159
diff -c -r1.159 Array.cc
*** liboctave/Array.cc 6 Sep 2007 12:08:44 -0000 1.159
--- liboctave/Array.cc 7 Sep 2007 21:40:31 -0000
***************
*** 1355,1361 ****
iidx++;
else
{
! new_data[ii] = elem (i);
ii++;
}
}
--- 1355,1361 ----
iidx++;
else
{
! new_data[ii] = xelem (i);
ii++;
}
}
***************
*** 1441,1447 ****
iidx++;
else
{
! new_data[ii] = elem (i);
ii++;
}
--- 1441,1447 ----
iidx++;
else
{
! new_data[ii] = xelem (i);
ii++;
}
***************
*** 1558,1564 ****
else
{
for (octave_idx_type i = 0; i < nr; i++)
! new_data[nr*jj+i] = elem (i, j);
jj++;
}
}
--- 1558,1564 ----
else
{
for (octave_idx_type i = 0; i < nr; i++)
! new_data[nr*jj+i] = xelem (i, j);
jj++;
}
}
***************
*** 1621,1627 ****
else
{
for (octave_idx_type j = 0; j < nc; j++)
! new_data[new_nr*j+ii] = elem (i, j);
ii++;
}
}
--- 1621,1627 ----
else
{
for (octave_idx_type j = 0; j < nc; j++)
! new_data[new_nr*j+ii] = xelem (i, j);
ii++;
}
}
***************
*** 1919,1925 ****
octave_idx_type kidx
= ::compute_index (temp_result_idx,
new_lhs_dim);
! new_data[kidx] = elem (result_idx);
}
increment_index (result_idx, lhs_dims);
--- 1919,1925 ----
octave_idx_type kidx
= ::compute_index (temp_result_idx,
new_lhs_dim);
! new_data[kidx] = xelem (result_idx);
}
increment_index (result_idx, lhs_dims);
***************
*** 1977,1983 ****
}
else
{
! new_data[ii++] = elem (lhs_ra_idx);
}
increment_index (lhs_ra_idx, lhs_dims);
--- 1977,1983 ----
}
else
{
! new_data[ii++] = xelem (lhs_ra_idx);
}
increment_index (lhs_ra_idx, lhs_dims);
***************
*** 2506,2511 ****
--- 2506,2513 ----
}
else
{
+ lhs.make_unique ();
+
if (rhs_len == n || rhs_len == 1)
{
octave_idx_type max_idx = lhs_idx.max () + 1;
***************
*** 2515,2534 ****
if (rhs_len == n)
{
! for (octave_idx_type i = 0; i < n; i++)
{
! octave_idx_type ii = lhs_idx.elem (i);
! lhs.elem (ii) = rhs.elem (i);
}
}
else if (rhs_len == 1)
{
RT scalar = rhs.elem (0);
! for (octave_idx_type i = 0; i < n; i++)
{
! octave_idx_type ii = lhs_idx.elem (i);
! lhs.elem (ii) = scalar;
}
}
else
--- 2517,2552 ----
if (rhs_len == n)
{
! if (lhs_idx.is_colon ())
{
! for (octave_idx_type i = 0; i < n; i++)
! lhs.xelem (i) = rhs.elem (i);
! }
! else
! {
! for (octave_idx_type i = 0; i < n; i++)
! {
! octave_idx_type ii = lhs_idx.elem (i);
! lhs.xelem (ii) = rhs.elem (i);
! }
}
}
else if (rhs_len == 1)
{
RT scalar = rhs.elem (0);
! if (lhs_idx.is_colon ())
{
! for (octave_idx_type i = 0; i < n; i++)
! lhs.xelem (i) = scalar;
! }
! else
! {
! for (octave_idx_type i = 0; i < n; i++)
! {
! octave_idx_type ii = lhs_idx.elem (i);
! lhs.xelem (ii) = scalar;
! }
}
}
else
***************
*** 2546,2555 ****
if (lhs_dims.all_zero ())
{
lhs.resize_no_fill (rhs_len);
for (octave_idx_type i = 0; i < rhs_len; i++)
! lhs.elem (i) = rhs.elem (i);
}
else if (rhs_len != lhs_len)
(*current_liboctave_error_handler)
--- 2564,2575 ----
if (lhs_dims.all_zero ())
{
+ lhs.make_unique ();
+
lhs.resize_no_fill (rhs_len);
for (octave_idx_type i = 0; i < rhs_len; i++)
! lhs.xelem (i) = rhs.elem (i);
}
else if (rhs_len != lhs_len)
(*current_liboctave_error_handler)
***************
*** 2669,2674 ****
--- 2689,2696 ----
if (n > 0 && m > 0)
{
+ lhs.make_unique ();
+
MAYBE_RESIZE_LHS;
RT scalar = xrhs.elem (0, 0);
***************
*** 2679,2685 ****
for (octave_idx_type i = 0; i < n; i++)
{
octave_idx_type ii = idx_i.elem (i);
! lhs.elem (ii, jj) = scalar;
}
}
}
--- 2701,2707 ----
for (octave_idx_type i = 0; i < n; i++)
{
octave_idx_type ii = idx_i.elem (i);
! lhs.xelem (ii, jj) = scalar;
}
}
}
***************
*** 2688,2693 ****
--- 2710,2717 ----
&& (rhs_nr == 1 || rhs_nc == 1)
&& n * m == rhs_nr * rhs_nc)
{
+ lhs.make_unique ();
+
MAYBE_RESIZE_LHS;
if (n > 0 && m > 0)
***************
*** 2700,2712 ****
for (octave_idx_type i = 0; i < n; i++)
{
octave_idx_type ii = idx_i.elem (i);
! lhs.elem (ii, jj) = xrhs.elem (k++);
}
}
}
}
else if (n == rhs_nr && m == rhs_nc)
{
MAYBE_RESIZE_LHS;
if (n > 0 && m > 0)
--- 2724,2738 ----
for (octave_idx_type i = 0; i < n; i++)
{
octave_idx_type ii = idx_i.elem (i);
! lhs.xelem (ii, jj) = xrhs.elem (k++);
}
}
}
}
else if (n == rhs_nr && m == rhs_nc)
{
+ lhs.make_unique ();
+
MAYBE_RESIZE_LHS;
if (n > 0 && m > 0)
***************
*** 2717,2723 ****
for (octave_idx_type i = 0; i < n; i++)
{
octave_idx_type ii = idx_i.elem (i);
! lhs.elem (ii, jj) = xrhs.elem (i, j);
}
}
}
--- 2743,2749 ----
for (octave_idx_type i = 0; i < n; i++)
{
octave_idx_type ii = idx_i.elem (i);
! lhs.xelem (ii, jj) = xrhs.elem (i, j);
}
}
}
***************
*** 2862,2881 ****
}
else if (len == rhs_nr * rhs_nc)
{
! for (octave_idx_type i = 0; i < len; i++)
{
! octave_idx_type ii = idx_i.elem (i);
! lhs.elem (ii) = xrhs.elem (i);
}
}
else if (rhs_is_scalar)
{
RT scalar = rhs.elem (0, 0);
! for (octave_idx_type i = 0; i < len; i++)
{
! octave_idx_type ii = idx_i.elem (i);
! lhs.elem (ii) = scalar;
}
}
else
--- 2888,2927 ----
}
else if (len == rhs_nr * rhs_nc)
{
! lhs.make_unique ();
!
! if (idx_i.is_colon ())
! {
! for (octave_idx_type i = 0; i < len; i++)
! lhs.xelem (i) = xrhs.elem (i);
! }
! else
{
! for (octave_idx_type i = 0; i < len; i++)
! {
! octave_idx_type ii = idx_i.elem (i);
! lhs.xelem (ii) = xrhs.elem (i);
! }
}
}
else if (rhs_is_scalar)
{
+ lhs.make_unique ();
+
RT scalar = rhs.elem (0, 0);
! if (idx_i.is_colon ())
{
! for (octave_idx_type i = 0; i < len; i++)
! lhs.xelem (i) = scalar;
! }
! else
! {
! for (octave_idx_type i = 0; i < len; i++)
! {
! octave_idx_type ii = idx_i.elem (i);
! lhs.xelem (ii) = scalar;
! }
}
}
else
***************
*** 2934,2941 ****
else if (n_idx == 1)
{
idx_vector iidx = idx(0);
! if (! (iidx.is_colon ()
|| (iidx.one_zero_only ()
&& iidx.orig_dimensions () == lhs.dims ())))
(*current_liboctave_warning_with_id_handler)
--- 2980,2988 ----
else if (n_idx == 1)
{
idx_vector iidx = idx(0);
+ int iidx_is_colon = iidx.is_colon ();
! if (! (iidx_is_colon
|| (iidx.one_zero_only ()
&& iidx.orig_dimensions () == lhs.dims ())))
(*current_liboctave_warning_with_id_handler)
***************
*** 2959,2980 ****
}
else if (len == rhs.length ())
{
! for (octave_idx_type i = 0; i < len; i++)
{
! octave_idx_type ii = iidx.elem (i);
! lhs.elem (ii) = rhs.elem (i);
}
}
else if (rhs_is_scalar)
{
RT scalar = rhs.elem (0);
! for (octave_idx_type i = 0; i < len; i++)
{
! octave_idx_type ii = iidx.elem (i);
! lhs.elem (ii) = scalar;
}
}
else
--- 3006,3047 ----
}
else if (len == rhs.length ())
{
! lhs.make_unique ();
!
! if (iidx_is_colon)
{
! for (octave_idx_type i = 0; i < len; i++)
! lhs.xelem (i) = rhs.elem (i);
! }
! else
! {
! for (octave_idx_type i = 0; i < len; i++)
! {
! octave_idx_type ii = iidx.elem (i);
! lhs.xelem (ii) = rhs.elem (i);
! }
}
}
else if (rhs_is_scalar)
{
RT scalar = rhs.elem (0);
! lhs.make_unique ();
!
! if (iidx_is_colon)
{
! for (octave_idx_type i = 0; i < len; i++)
! lhs.xelem (i) = scalar;
! }
! else
! {
! for (octave_idx_type i = 0; i < len; i++)
! {
! octave_idx_type ii = iidx.elem (i);
! lhs.xelem (ii) = scalar;
! }
}
}
else
***************
*** 3131,3136 ****
--- 3198,3205 ----
if (rhs_is_scalar)
{
+ lhs.make_unique ();
+
if (n_idx < orig_lhs_dims_len)
lhs = lhs.reshape (lhs_dims);
***************
*** 3146,3156 ****
octave_idx_type len = frozen_len(0);
! for (octave_idx_type i = 0; i < len; i++)
{
! octave_idx_type ii = iidx.elem (i);
! lhs.elem (ii) = scalar;
}
}
else if (lhs_dims_len == 2 && n_idx == 2)
--- 3215,3233 ----
octave_idx_type len = frozen_len(0);
! if (iidx.is_colon ())
{
! for (octave_idx_type i = 0; i < len; i++)
! lhs.xelem (i) = scalar;
! }
! else
! {
! for (octave_idx_type i = 0; i < len; i++)
! {
! octave_idx_type ii = iidx.elem (i);
! lhs.xelem (ii) = scalar;
! }
}
}
else if (lhs_dims_len == 2 && n_idx == 2)
***************
*** 3161,3173 ****
octave_idx_type i_len = frozen_len(0);
octave_idx_type j_len = frozen_len(1);
! for (octave_idx_type j = 0; j < j_len; j++)
{
! octave_idx_type jj = idx_j.elem (j);
! for (octave_idx_type i = 0; i < i_len; i++)
{
! octave_idx_type ii = idx_i.elem (i);
! lhs.elem (ii, jj) = scalar;
}
}
}
--- 3238,3264 ----
octave_idx_type i_len = frozen_len(0);
octave_idx_type j_len = frozen_len(1);
! if (idx_i.is_colon())
{
! for (octave_idx_type j = 0; j < j_len; j++)
{
! octave_idx_type off = new_dims (0) *
! idx_j.elem (j);
! for (octave_idx_type i = 0; i < i_len; i++)
! lhs.xelem (i + off) = scalar;
! }
! }
! else
! {
! for (octave_idx_type j = 0; j < j_len; j++)
! {
! octave_idx_type off = new_dims (0) *
! idx_j.elem (j);
! for (octave_idx_type i = 0; i < i_len; i++)
! {
! octave_idx_type ii = idx_i.elem (i);
! lhs.xelem (ii + off) = scalar;
! }
}
}
}
***************
*** 3181,3187 ****
{
Array<octave_idx_type> elt_idx = get_elt_idx
(idx, result_idx);
! lhs.elem (elt_idx) = scalar;
increment_index (result_idx, frozen_len);
}
--- 3272,3278 ----
{
Array<octave_idx_type> elt_idx = get_elt_idx
(idx, result_idx);
! lhs.xelem (elt_idx) = scalar;
increment_index (result_idx, frozen_len);
}
***************
*** 3203,3208 ****
--- 3294,3301 ----
}
else
{
+ lhs.make_unique ();
+
if (n_idx < orig_lhs_dims_len)
lhs = lhs.reshape (lhs_dims);
***************
*** 3216,3226 ****
octave_idx_type len = frozen_len(0);
! for (octave_idx_type i = 0; i < len; i++)
{
! octave_idx_type ii = iidx.elem (i);
! lhs.elem (ii) = rhs.elem (i);
}
}
else if (lhs_dims_len == 2 && n_idx == 2)
--- 3309,3327 ----
octave_idx_type len = frozen_len(0);
! if (iidx.is_colon ())
{
! for (octave_idx_type i = 0; i < len; i++)
! lhs.xelem (i) = rhs.elem (i);
! }
! else
! {
! for (octave_idx_type i = 0; i < len; i++)
! {
! octave_idx_type ii = iidx.elem (i);
! lhs.xelem (ii) = rhs.elem (i);
! }
}
}
else if (lhs_dims_len == 2 && n_idx == 2)
***************
*** 3232,3246 ****
octave_idx_type j_len = frozen_len(1);
octave_idx_type k = 0;
! for (octave_idx_type j = 0; j < j_len; j++)
{
! octave_idx_type jj = idx_j.elem (j);
! for (octave_idx_type i = 0; i < i_len; i++)
{
! octave_idx_type ii = idx_i.elem (i);
! lhs.elem (ii, jj) = rhs.elem (k++);
}
}
}
else
{
--- 3333,3363 ----
octave_idx_type j_len = frozen_len(1);
octave_idx_type k = 0;
! if (idx_i.is_colon())
{
! for (octave_idx_type j = 0; j < j_len; j++)
{
! octave_idx_type off = new_dims (0) *
! idx_j.elem (j);
! for (octave_idx_type i = 0;
! i < i_len; i++)
! lhs.xelem (i + off) = rhs.elem (k++);
}
}
+ else
+ {
+ for (octave_idx_type j = 0; j < j_len; j++)
+ {
+ octave_idx_type off = new_dims (0) *
+ idx_j.elem (j);
+ for (octave_idx_type i = 0; i < i_len;
i++)
+ {
+ octave_idx_type ii = idx_i.elem (i);
+ lhs.xelem (ii + off) = rhs.elem (k++);
+ }
+ }
+ }
+
}
else
{
***************
*** 3252,3258 ****
{
Array<octave_idx_type> elt_idx = get_elt_idx
(idx, result_idx);
! lhs.elem (elt_idx) = rhs.elem (i);
increment_index (result_idx, frozen_len);
}
--- 3369,3375 ----
{
Array<octave_idx_type> elt_idx = get_elt_idx
(idx, result_idx);
! lhs.xelem (elt_idx) = rhs.elem (i);
increment_index (result_idx, frozen_len);
}
Index: liboctave/Array.h
===================================================================
RCS file: /usr/local/cvsroot/octave/liboctave/Array.h,v
retrieving revision 1.100
diff -c -r1.100 Array.h
*** liboctave/Array.h 6 Sep 2007 12:08:45 -0000 1.100
--- liboctave/Array.h 7 Sep 2007 21:40:32 -0000
***************
*** 110,115 ****
--- 110,120 ----
//--------------------------------------------------------------------
+ public:
+
+ // !!! WARNING !!! -- these should be protected, not public. You
+ // should not access these methods directly!
+
void make_unique (void)
{
if (rep->count > 1)
***************
*** 130,137 ****
rep->fill (val);
}
- public:
-
typedef T element_type;
// !!! WARNING !!! -- these should be protected, not public. You
--- 135,140 ----
Index: liboctave/idx-vector.cc
===================================================================
RCS file: /usr/local/cvsroot/octave/liboctave/idx-vector.cc,v
retrieving revision 1.65
diff -c -r1.65 idx-vector.cc
*** liboctave/idx-vector.cc 26 Mar 2007 16:51:47 -0000 1.65
--- liboctave/idx-vector.cc 7 Sep 2007 21:40:32 -0000
***************
*** 197,203 ****
for (octave_idx_type i = 1; i < len; i++)
data[i] = data[i-1] + step;
! init_state ();
}
else
(*current_liboctave_error_handler)
--- 197,224 ----
for (octave_idx_type i = 1; i < len; i++)
data[i] = data[i-1] + step;
! // Don't use init_state(), as it can be vastly accelerated since
! // we don't have to search all values for max/min, etc.
! if (data[0] < data [len - 1])
! {
! min_val = data [0];
! max_val = data [len - 1];
! }
! else
! {
! min_val = data [len - 1];
! max_val = data [0];
! }
!
! octave_idx_type n_zero = - b / step;
! if ((n_zero == 0 && b == 0) ||
! (n_zero > 0 && (b % (n_zero * step)) == 0))
! num_zeros = 1;
!
! octave_idx_type n_one = (1 - b) / step;
! if ((n_one == 0 && b == 1) ||
! (n_one > 0 && (b % (n_one * step)) == 0))
! num_ones = 1;
}
else
(*current_liboctave_error_handler)
2007-09-06 David Bateman <address@hidden>
* Array.cc (void make_unique(void)): Make public so that the
::assign functions can access it directly.
* Array.cc (Array<T>::maybe_delete_elements_1(idx_vector&),
Array<T>::maybe_delete_elements_1(idx_vector&),
Array<T>::maybe_delete_elements(idx_vector&, idx_vector&),
Array<T>::maybe_delete_elements(Array<idx_vector>&, const T&)):
Use xelem for non const RHS to avoid call to make_unique.
(int assign1 (Array<LT>&, const Array<RT>&, const LT&)): Use
xelem for LHS and call lhs.make_unique() only once. Special case
the is_colon index case and use Array<T>::xelem(octave_idx_type)
rather than Array<T>::xelem(octave_idx_type,octave_idx_type) and
bring the additional multiplication out of the inner loop.
(int assign2 (Array<LT>&, const Array<RT>&, const LT&)): ditto.
(int assignN (Array<LT>&, const Array<RT>&, const LT&)): ditto.
* idx-vector.h (idx_vector::idx_vector_rep::idx_vector_rep
(const Range& r)): Don't use init_state() method but special case
as with a Range can avoid exhaustive search.
function [t1, t2] = testsubsassgn (m, n, nrun)
if (nargin < 3)
nrun = 3;
endif
if (nargin < 2)
n = [10, 100];
endif
if (nargin < 1)
m = [10, 100];
endif
t1 = zeros (length(m), length(n));
t2 = zeros (length(m), length(n));
for i = 1 : length(m)
col = ones (m(i), 1);
col1 = 1:m(i);
for j = 1 : length(n)
for k = 1 : nrun
b = zeros (m(i), n(j));
t0 = cputime ();
for l = 1 : n (j)
b(:,l) = col;
endfor
t1 (i, j) = cputime () - t0;
b = zeros(m(i), n(j));
t0 = cputime();
for l = 1 : n (j)
b(col1,l) = col;
endfor
t2 (i, j) = cputime () - t0;
endfor
t1 (i, j) /= nrun;
t2 (i, j) /= nrun;
endfor
endfor
endfunction