\let\mod=\bmod
@*Intro. This program finds all odd $n$-bit palindromes $x$ that are perfect
squares, using roughly $2^{n/4}$ steps of computation. Thus I hope to use it
for $n$ well over 100.
The idea is to try all $2^t$ combinations of the rightmost and leftmost
$t+3$ bits, for $t\approx n/4$, and to use number theory to rule out
the bad cases rather quickly.
(When $n=100$ I'll be using $t=22$.
This program is a big improvement over the one I wrote in 2013;
that one used $t=31$ when $n=100$, and $t\approx n/3$ in general.
Michael Coriand surprised me last week by claiming that he had a method
using only about $n/4$. At first I was mystified, baffled, stumped. But
aha, I woke up this morning with a good guess about what he'd discovered!
He asked me to doublecheck his results; and
I can't resist, even though I've got more than enough other things to do,
because it's fun to write useless code like~this.)
I haven't optimized this program for computational speed. My main goal was to
get it right, with my personal time minimized.
On the other hand I could easily have made it run a lot slower:
I didn't pass up some ``obvious'' ways to avoid redundant computations.
@d maxn 180 /* I could go a little higher, but there won't be time */
@c
#include
#include
int n; /* the length of palindromes sought */
unsigned long long y[maxn/2],r[maxn/2]; /* table of partial square roots */
unsigned long long q[maxn/4],qq[maxn/4]; /* table of partial modular sqrts */
unsigned long long pretrial[2],trial[3],acc[6]; /* multiplication workspace */
main(int argc,char* argv[]) {
register unsigned long long prod,sqrtxl,a,bit;
register int j,k,t,p,jj,kk;
@;
printf("Binary palindromic squares with %d bits:\n", n);
@;
for (a=0;a<1LL<;
}
@ @=
if (argc!=2 || sscanf(argv[1],"%d",&n)!=1) {
fprintf(stderr,"Usage: %s n\n",
argv[0]);
exit(-1);
}
if (n<15 || n>maxn) {
fprintf(stderr, "Sorry: n should be between 15 and %d.\n",
maxn);
exit(-2);
}
@ Here's the theory: Let $a_1a_2\ldots a_t$ be a binary string, and suppose
$$x\;=\;2^{n-1}+2^{n-4}a_1+2^{n-5}a_2+\cdots+2^{n-3-t}a_t+\cdots+
2^{t+2}a_t+\cdots+2^4a_2+2^3a_1+1\;=\;y^2.$$
Let $x_l=2^{n-1}+2^{n-4}a_1+2^{n-5}a_2+\cdots+2^{n-3-t}a_t$ and
$x_u=x_l+2^{n-3-t}$.
It's easy to prove by induction on $t$ that
there's a unique integer $q$ between 0 and $2^{t+2}$ such that
$q\mod4=1$ and $q^2\mod2^{t+3}=2^{t+2}a_t+\cdots+2^4a_2+2^3a_1+1$,
whenever $t>0$.
Hence the lower bits of the square root, $y\mod 2^{t+2}$,
must be either $q$ or $2^{t+2}-q$.
On the other hand $x_l=
for (j=1;j<=t;j++) q[j]=qq[j]=1;
@ @=
q[p]^=1LL<<(p+1);
qq[p]=q[p]*q[p];
for (j=p+1;j<=t;j++) {
if (qq[j-1]&(1LL<<(j+2))) q[j]=q[j-1]^(1LL<<(j+1));
else q[j]=q[j-1];
qq[j]=q[j]*q[j];
}
@ Similarly, we'll have to compute $2^t$ approximate square roots
for the leading bits of~$y$. Let |y[j]| be bits $m$ through~$j$
of $\sqrt{x_l}$, where $m=\lfloor n/2\rfloor-1$ is the index
of the leading bit. The classical algorithm for square root extraction
tells us how to go from |y[j]| to |y[j-1]|: We have a ``remainder''
|r[j]| representing the difference from the leading bits of~$x_l$
and $|y[j]|^2$, where $|r[j]|\le2|y[j]|$. To preserve this
invariant when $a_1\ldots a_t=0\ldots0$, we set |y[j-1]=2y[j]| and
|r[j-1]=4r[j]|; if then |r[j-1]>2y[j-1]| we subtract |2y[j-1]+1|
from |r[j-1]| and increase |y[j-1]| by~1. To preserve the invariant
for other values of $a_1\ldots a_t$, the same steps apply except
that $r[j-1]=4r[j]+2a_i+a_{i+1}$ for an appropriate value of~$i$.
The bits of the square root need only be computed for $j\ge t+2$;
therefore all computations fit easily into a single |long long| register.
Once again it's easy to prime the pump when $a_1\ldots a_t=0\ldots0$,
and to move to the successor by updating fewer than two
entries on average (plus roughly $n/8$ entries ``in the middle''
where $x_l$ has roughly $n/4$ zeros).
@=
if (n&1) {
y[(n-3)/2]=2,r[(n-3)/2]=0;
for (j=(n-5)/2;j>=t+2;j--) y[j]=2*y[j+1],r[j]=0;
}@+else {
y[n/2-1]=1,r[n/2-1]=1;
for (j=n/2-2;j>=t+2;j--) {
y[j]=2*y[j+1],r[j]=4*r[j+1];
if (r[j]>2*y[j]) r[j]-=2*y[j]+1,y[j]++;
}
}
@ @=
j=(n-3-p)/2;
if ((n+p)&1) r[j]+=1;
else r[j]=4*r[j+1]+2,y[j]=2*y[j+1];
if (r[j]>2*y[j]) r[j]-=2*y[j]+1,y[j]++;
for (j--;j>=t+2;j--) {
y[j]=2*y[j+1],r[j]=4*r[j+1];
if (r[j]>2*y[j]) r[j]-=2*y[j]+1,y[j]++;
}
@ Now comes the boring stuff. I hope I don't mess up here.
To make the final test, I'll need to square a number of up to 90 bits.
I simply treat it as three 32-bit chunks, and multiply by the
textbook method.
@d m32 0xffffffff /* 32-bit mask */
@=
for (j=0;j<3;j++) {
prod=trial[j]*trial[0];
if (j) prod+=acc[j];
acc[j]=prod&m32;
prod>>=32;
prod+=trial[j]*trial[1];
if (j) prod+=acc[j+1];
acc[j+1]=prod&m32;
prod>>=32;
prod+=trial[j]*trial[2];
if (j) prod+=acc[j+2];
acc[j+2]=prod&m32;
acc[j+3]=prod>>32;
}
@ To manufacture the |trial|, I need to shift the leading digits
appropriately and combine them with the trailing digits.
First, I put the leading digits into |pretrial| and |trial|.
(This can be tricky: If $n=129$ or 130, so that $t=29$, there
are 34 leading digits; one of them will go into |trial[0]|,
32 into |trial[1]|, and one into |trial[2]|.)
@=
if (t+2<32) pretrial[0]=(sqrtxl<<(t+2))&m32,
pretrial[1]=(sqrtxl>>(30-t))&m32;
else pretrial[0]=0,pretrial[1]=(sqrtxl<<(t-30))&m32;
trial[2]=sqrtxl>>(62-t);
@ @=
if (t+2<=32) trial[0]=pretrial[0]+q[t],trial[1]=pretrial[1];
else trial[0]=q[t]&m32, trial[1]=pretrial[1]+(q[t]>>32);
@ @d comp(x) ((1LL<<(t+2))-(x))
@=
if (t+2<=32) trial[0]=pretrial[0]+comp(q[t]),trial[1]=pretrial[1];
else trial[0]=comp(q[t])&m32, trial[1]=pretrial[1]+(comp(q[t])>>32);
@ I make $t=\lfloor(n-11)/4\rfloor$. (It will be between 1 and 42.)
@=
t=(n-11)/4;
@;
@;
@ And now at last the denouement, where we put everything together.
@=
{
sqrtxl=y[t+2];
for (p=t,bit=1;a&bit;p--,bit<<=1) ;
@;
if (y[t+2]>=sqrtxl+4)
fprintf(stderr,"Something's wrong in case %llx!\n",
a);
for (;sqrtxl<=y[t+2];sqrtxl++) {
@;
@;
@;
@;
@;
}
@
}
@ @=
@;
for (j=0,k=n-1;j>5]&(1<<(j&0x1f)))!=0);
kk=((acc[k>>5]&(1<<(k&0x1f)))!=0);
if (jj!=kk) break;
}
if (j>=k) /* solution! */
printf("%08llx%08llx%08llx^2=%08llx%08llx%08llx%08llx%08llx%08llx\n",
trial[2],trial[1],trial[0],
acc[5],acc[4],acc[3],acc[2],acc[1],acc[0]);
@*Index.