Optimize memcpy

This speeds up memcpy by copying a word at a time if source and destination are
aligned in mod 4. That is, if n and m are positive integers:

  4n -> 4m: aligned, 4x speed.
  4n -> 4m+1: misaligned.
  4n+1 -> 4m+1: aligned in mod 4, 4x speed.

Ran the unit test on Peppy:

  > runtest
  ...
  Running test_memcpy... (speed gain: 120300 -> 38103 us) OK
  ...

Ran make buildall -j:

  ...
  Running test_memcpy... (speed gain: 2084 -> 549 us) OK
  ...

Note the misaligned case is also optimized. The unit test runs in 298 us on Peppy,
while it takes about 475 us with the original memcpy.

TEST=Described above.
BUG=chrome-os-partner:23720
BRANCH=none
Signed-off-by: Daisuke Nojiri <dnojiri@chromium.org>
Change-Id: Ic12260451c5efd0896d6353017cd45d29cb672db
Tested-by: Daisuke Nojiri <dnojiri@google.com>
Reviewed-on: https://chromium-review.googlesource.com/185618
Reviewed-by: Randall Spangler <rspangler@chromium.org>
Reviewed-by: Vincent Palatin <vpalatin@chromium.org>
Commit-Queue: Daisuke Nojiri <dnojiri@google.com>
This commit is contained in:
Daisuke Nojiri
2014-02-08 17:03:15 -08:00
committed by chrome-internal-fetch
parent a78c59e4ac
commit d3facbd92f
2 changed files with 87 additions and 7 deletions

View File

@@ -173,16 +173,44 @@ int memcmp(const void *s1, const void *s2, int len)
/*
 * Copy len bytes from src to dest; returns dest.
 *
 * When src and dest have the same alignment modulo 4, the bulk of the
 * region is copied one 32-bit word at a time (~4x faster than the byte
 * loop): the unaligned 'head' bytes up to the first word boundary and
 * the 'tail' bytes after the last word boundary are copied individually.
 * When the pointers differ in alignment mod 4, no word copy is possible
 * and every byte goes through the byte loop (head == tail in that case,
 * so the body and tail loops do nothing).
 *
 * NOTE(review): accessing the buffers through uint32_t * is a strict-
 * aliasing violation in ISO C; acceptable here only because this IS the
 * platform memcpy and the toolchain is known.
 */
void *memcpy(void *dest, const void *src, int len)
{
	char *d = (char *)dest;
	const char *s = (const char *)src;
	uint32_t *dw;
	const uint32_t *sw;
	char *head;
	char * const tail = (char *)dest + len;
	/* Set 'body' to the last word boundary */
	uint32_t * const body = (uint32_t *)((uintptr_t)tail & ~3);

	if (((uintptr_t)dest & 3) != ((uintptr_t)src & 3)) {
		/* Misaligned. no body, no tail. */
		head = tail;
	} else {
		/* Aligned */
		if ((uintptr_t)tail < (((uintptr_t)d + 3) & ~3))
			/* len is shorter than the first word boundary */
			head = tail;
		else
			/* Set 'head' to the first word boundary */
			head = (char *)(((uintptr_t)d + 3) & ~3);
	}

	/* Copy head byte by byte */
	while (d < head)
		*(d++) = *(s++);

	/* Copy body one word at a time */
	dw = (uint32_t *)d;
	sw = (const uint32_t *)s;
	while (dw < body)
		*(dw++) = *(sw++);

	/* Copy tail byte by byte */
	d = (char *)dw;
	s = (const char *)sw;
	while (d < tail)
		*(d++) = *(s++);

	return dest;
}

View File

@@ -83,6 +83,57 @@ static int test_memmove(void)
return EC_SUCCESS;
}
/*
 * Exercise memcpy: verify correctness for aligned, unaligned, and tiny
 * copies, and check that the aligned word-copy path is meaningfully
 * faster than the byte-by-byte unaligned path.
 */
static int test_memcpy(void)
{
	int idx;
	timestamp_t t0, t1, t2, t3;
	char *buf;
	const int buf_size = 1000;
	const int len = 400;
	const int dest_offset = 500;
	const int iteration = 1000;

	TEST_ASSERT(shared_mem_acquire(buf_size, &buf) == EC_SUCCESS);

	/* Pattern in the first 'len' bytes, zeros in the remainder */
	for (idx = 0; idx < buf_size; ++idx)
		buf[idx] = (idx < len) ? (idx & 0x7f) : 0;

	/* Time the unaligned path: destination shifted by one byte */
	t0 = get_time();
	for (idx = 0; idx < iteration; ++idx)
		memcpy(buf + dest_offset + 1, buf, len); /* unaligned */
	t1 = get_time();
	TEST_ASSERT_ARRAY_EQ(buf + dest_offset + 1, buf, len);
	ccprintf(" (speed gain: %d ->", t1.val-t0.val);

	/* Time the aligned path: both pointers equal mod 4 */
	t2 = get_time();
	for (idx = 0; idx < iteration; ++idx)
		memcpy(buf + dest_offset, buf, len); /* aligned */
	t3 = get_time();
	ccprintf(" %d us) ", t3.val-t2.val);
	TEST_ASSERT_ARRAY_EQ(buf + dest_offset, buf, len);

	/* Expected about 4x speed gain. Use 3x because it fluctuates */
	TEST_ASSERT((t1.val-t0.val) > (t3.val-t2.val) * 3);

	/* Same alignment mod 4 but neither pointer word-aligned */
	memcpy(buf + dest_offset + 1, buf + 1, len - 1);
	TEST_ASSERT_ARRAY_EQ(buf + dest_offset + 1, buf + 1, len - 1);

	/* Small copies, aligned and unaligned */
	memcpy(buf + dest_offset, buf, 1);
	TEST_ASSERT_ARRAY_EQ(buf + dest_offset, buf, 1);
	memcpy(buf + dest_offset, buf, 4);
	TEST_ASSERT_ARRAY_EQ(buf + dest_offset, buf, 4);
	memcpy(buf + dest_offset + 1, buf, 1);
	TEST_ASSERT_ARRAY_EQ(buf + dest_offset + 1, buf, 1);
	memcpy(buf + dest_offset + 1, buf, 4);
	TEST_ASSERT_ARRAY_EQ(buf + dest_offset + 1, buf, 4);

	shared_mem_release(buf);
	return EC_SUCCESS;
}
static int test_strzcpy(void)
{
char dest[10];
@@ -305,6 +356,7 @@ void run_test(void)
RUN_TEST(test_strtoi);
RUN_TEST(test_parse_bool);
RUN_TEST(test_memmove);
RUN_TEST(test_memcpy);
RUN_TEST(test_strzcpy);
RUN_TEST(test_strlen);
RUN_TEST(test_strcasecmp);