1 /* Optimized memset implementation for PowerPC32/POWER7.
2 Copyright (C) 2010-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
/* Overview: lengths of 0~8 bytes branch to L(small), lengths of 9~31
   bytes branch to L(medium), and longer fills take the doubleword-
   aligned path below.  A long fill with a zero value is special-cased
   to use dcbz on 128-byte blocks.  */
21 /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
32 mr 10,3 /* Copy s (r3) into r10; r10 is the DST pointer used below. */
33 mr 7,1 /* Save original r1 in r7; it is restored before tail handling. */
36 /* Replicate byte to word. */
40 ble cr6,L(small) /* If length <= 8, use short copy code (0~8 bytes). */
43 ble cr7,L(medium) /* If length < 32, use medium copy code (9~31 bytes). */
45 /* Save our word twice to create a doubleword that we will later
48 andi. 11,10,7 /* Check alignment of DST. */
58 /* Get DST aligned to 8 bytes. */
/* NOTE(review): the CR bit tested by bf below is loaded by code that is
   not visible in this view -- confirm its meaning against the setup.  */
67 4: bf 29,L(big_aligned)
79 srwi 9,5,3 /* r9 = r5 >> 3: number of full doublewords remaining. */
84 /* From this point on, we'll copy 32+ bytes and the value
85 isn't 0 (so we can't use dcbz). */
93 /* Copy 1~3 doublewords so the main loop starts
94 at a multiple of 32 bytes. */
109 blt cr1,L(tail_bytes)
114 1: /* Copy 1 doubleword. */
121 /* First use a 32-byte loop with stw's to try and avoid the LHS
122 (load-hit-store) due to the lfd we will do next. Also, ping-pong
123 through r10 and r12 to avoid AGEN (address generation) delays. */
146 bdnz L(big_loop_fast_setup)
151 /* Now that we're probably past the LHS window, use VSX to
152 speed up the loop. */
153 L(big_loop_fast_setup):
168 bdnz L(big_loop_fast)
175 /* Check for tail bytes. */
176 mr 1,7 /* Restore r1 from the copy saved in r7 at entry. */
182 /* At this point we have a tail of 0-7 bytes and we know that the
183 destination is doubleword-aligned. */
184 4: /* Copy 4 bytes. */
189 2: /* Copy 2 bytes. */
194 1: /* Copy 1 byte. */
201 /* Special case when value is 0 and we have a long length to deal
202 with. Use dcbz to zero out 128 bytes at a time. Before using
203 dcbz though, we need to get the destination 128-byte aligned. */
216 /* Get DST aligned to 128 bytes. */
/* NOTE(review): the unit selected by CR bit 31 here depends on CR setup
   that is not visible in this view -- confirm before relying on it.  */
243 1: bf 31,L(huge_aligned)
254 /* Copy 128 bytes at a time. */
261 /* We have a tail of 0~127 bytes to handle. */
262 mr 1,7 /* Restore r1 from the copy saved in r7 at entry. */
271 /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
272 speed. We'll handle the resulting tail bytes later. */
306 /* Handle the rest of the tail bytes here. */
327 /* Expanded tree to copy tail bytes without increments. */
362 /* Handle copies of 9~31 bytes. */
365 /* At least 9 bytes to go. */
368 beq L(medium_aligned)
370 /* Force 4-byte alignment for DST. */
373 1: /* Copy 1 byte. */
378 2: /* Copy 2 bytes. */
379 bf 30,L(medium_aligned) /* CR bit 30 clear: go to L(medium_aligned). */
386 /* At least 6 bytes to go, and DST is word-aligned. */
397 8: /* Copy 8 bytes. */
403 4: /* Copy 4 bytes. */
408 2: /* Copy 2-3 bytes. */
413 1: /* Copy 1 byte. */
419 /* Handle copies of 0~8 bytes. */
430 libc_hidden_builtin_def (memset)