blob: f35c5fb5deff9ab3c1122ec196e76acc5cdad7c9 [file] [log] [blame]
Austin Schuhbb1338c2024-06-15 19:31:16 -07001/* mpn_mulmid -- middle product
2
3 Contributed by David Harvey.
4
5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9Copyright 2011 Free Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of either:
15
16 * the GNU Lesser General Public License as published by the Free
17 Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19
20or
21
22 * the GNU General Public License as published by the Free Software
23 Foundation; either version 2 of the License, or (at your option) any
24 later version.
25
26or both in parallel, as here.
27
28The GNU MP Library is distributed in the hope that it will be useful, but
29WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31for more details.
32
33You should have received copies of the GNU General Public License and the
34GNU Lesser General Public License along with the GNU MP Library. If not,
35see https://www.gnu.org/licenses/. */
36
37
38#include "gmp-impl.h"
39
40
41#define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
42
43
44void
45mpn_mulmid (mp_ptr rp,
46 mp_srcptr ap, mp_size_t an,
47 mp_srcptr bp, mp_size_t bn)
48{
49 mp_size_t rn, k;
50 mp_ptr scratch, temp;
51
52 ASSERT (an >= bn);
53 ASSERT (bn >= 1);
54 ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
55 ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
56
57 if (bn < MULMID_TOOM42_THRESHOLD)
58 {
59 /* region not tall enough to make toom42 worthwhile for any portion */
60
61 if (an < CHUNK)
62 {
63 /* region not too wide either, just call basecase directly */
64 mpn_mulmid_basecase (rp, ap, an, bp, bn);
65 return;
66 }
67
68 /* Region quite wide. For better locality, use basecase on chunks:
69
70 AAABBBCC..
71 .AAABBBCC.
72 ..AAABBBCC
73 */
74
75 k = CHUNK - bn + 1; /* number of diagonals per chunk */
76
77 /* first chunk (marked A in the above diagram) */
78 mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
79
80 /* remaining chunks (B, C, etc) */
81 an -= k;
82
83 while (an >= CHUNK)
84 {
85 mp_limb_t t0, t1, cy;
86 ap += k, rp += k;
87 t0 = rp[0], t1 = rp[1];
88 mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
89 ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
90 MPN_INCR_U (rp + 1, k + 1, t1 + cy);
91 an -= k;
92 }
93
94 if (an >= bn)
95 {
96 /* last remaining chunk */
97 mp_limb_t t0, t1, cy;
98 ap += k, rp += k;
99 t0 = rp[0], t1 = rp[1];
100 mpn_mulmid_basecase (rp, ap, an, bp, bn);
101 ADDC_LIMB (cy, rp[0], rp[0], t0);
102 MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
103 }
104
105 return;
106 }
107
108 /* region is tall enough for toom42 */
109
110 rn = an - bn + 1;
111
112 if (rn < MULMID_TOOM42_THRESHOLD)
113 {
114 /* region not wide enough to make toom42 worthwhile for any portion */
115
116 TMP_DECL;
117
118 if (bn < CHUNK)
119 {
120 /* region not too tall either, just call basecase directly */
121 mpn_mulmid_basecase (rp, ap, an, bp, bn);
122 return;
123 }
124
125 /* Region quite tall. For better locality, use basecase on chunks:
126
127 AAAAA....
128 .AAAAA...
129 ..BBBBB..
130 ...BBBBB.
131 ....CCCCC
132 */
133
134 TMP_MARK;
135
136 temp = TMP_ALLOC_LIMBS (rn + 2);
137
138 /* first chunk (marked A in the above diagram) */
139 bp += bn - CHUNK, an -= bn - CHUNK;
140 mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
141
142 /* remaining chunks (B, C, etc) */
143 bn -= CHUNK;
144
145 while (bn >= CHUNK)
146 {
147 ap += CHUNK, bp -= CHUNK;
148 mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
149 mpn_add_n (rp, rp, temp, rn + 2);
150 bn -= CHUNK;
151 }
152
153 if (bn)
154 {
155 /* last remaining chunk */
156 ap += CHUNK, bp -= bn;
157 mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
158 mpn_add_n (rp, rp, temp, rn + 2);
159 }
160
161 TMP_FREE;
162 return;
163 }
164
165 /* we're definitely going to use toom42 somewhere */
166
167 if (bn > rn)
168 {
169 /* slice region into chunks, use toom42 on all chunks except possibly
170 the last:
171
172 AA....
173 .AA...
174 ..BB..
175 ...BB.
176 ....CC
177 */
178
179 TMP_DECL;
180 TMP_MARK;
181
182 temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
183 scratch = temp + rn + 2;
184
185 /* first chunk (marked A in the above diagram) */
186 bp += bn - rn;
187 mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
188
189 /* remaining chunks (B, C, etc) */
190 bn -= rn;
191
192 while (bn >= rn)
193 {
194 ap += rn, bp -= rn;
195 mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
196 mpn_add_n (rp, rp, temp, rn + 2);
197 bn -= rn;
198 }
199
200 if (bn)
201 {
202 /* last remaining chunk */
203 ap += rn, bp -= bn;
204 mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
205 mpn_add_n (rp, rp, temp, rn + 2);
206 }
207
208 TMP_FREE;
209 }
210 else
211 {
212 /* slice region into chunks, use toom42 on all chunks except possibly
213 the last:
214
215 AAABBBCC..
216 .AAABBBCC.
217 ..AAABBBCC
218 */
219
220 TMP_DECL;
221 TMP_MARK;
222
223 scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
224
225 /* first chunk (marked A in the above diagram) */
226 mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
227
228 /* remaining chunks (B, C, etc) */
229 rn -= bn;
230
231 while (rn >= bn)
232 {
233 mp_limb_t t0, t1, cy;
234 ap += bn, rp += bn;
235 t0 = rp[0], t1 = rp[1];
236 mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
237 ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
238 MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
239 rn -= bn;
240 }
241
242 TMP_FREE;
243
244 if (rn)
245 {
246 /* last remaining chunk */
247 mp_limb_t t0, t1, cy;
248 ap += bn, rp += bn;
249 t0 = rp[0], t1 = rp[1];
250 mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
251 ADDC_LIMB (cy, rp[0], rp[0], t0);
252 MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
253 }
254 }
255}