\let\mod=\bmod @*Intro. This program finds all odd $n$-bit palindromes $x$ that are perfect squares, using roughly $2^{n/4}$ steps of computation. Thus I hope to use it for $n$ well over 100. The idea is to try all $2^t$ combinations of the rightmost and leftmost $t+3$ bits, for $t\approx n/4$, and to use number theory to rule out the bad cases rather quickly. (When $n=100$ I'll be using $t=22$. This program is a big improvement over the one I wrote in 2013; that one used $t=31$ when $n=100$, and $t\approx n/3$ in general. Michael Coriand surprised me last week by claiming that he had a method using only about $n/4$. At first I was mystified, baffled, stumped. But aha, I woke up this morning with a good guess about what he'd discovered! He asked me to doublecheck his results; and I can't resist, even though I've got more than enough other things to do, because it's fun to write useless code like~this.) I haven't optimized this program for computational speed. My main goal was to get it right, with my personal time minimized. On the other hand I could easily have made it run a lot slower: I didn't pass up some ``obvious'' ways to avoid redundant computations. @d maxn 180 /* I could go a little higher, but there won't be time */ @c #include #include int n; /* the length of palindromes sought */ unsigned long long y[maxn/2],r[maxn/2]; /* table of partial square roots */ unsigned long long q[maxn/4],qq[maxn/4]; /* table of partial modular sqrts */ unsigned long long pretrial[2],trial[3],acc[6]; /* multiplication workspace */ main(int argc,char* argv[]) { register unsigned long long prod,sqrtxl,a,bit; register int j,k,t,p,jj,kk; @; printf("Binary palindromic squares with %d bits:\n", n); @; for (a=0;a<1LL<; } @ @= if (argc!=2 || sscanf(argv[1],"%d",&n)!=1) { fprintf(stderr,"Usage: %s n\n", argv[0]); exit(-1); } if (n<15 || n>maxn) { fprintf(stderr, "Sorry: n should be between 15 and %d.\n", maxn); exit(-2); } @ Here's the theory: Let $a_1a_2\ldots a_t$ be a binary string, and suppose $$x\;=\;2^{n-1}+2^{n-4}a_1+2^{n-5}a_2+\cdots+2^{n-3-t}a_t+\cdots+ 2^{t+2}a_t+\cdots+2^4a_2+2^3a_1+1\;=\;y^2.$$ Let $x_l=2^{n-1}+2^{n-4}a_1+2^{n-5}a_2+\cdots+2^{n-3-t}a_t$ and $x_u=x_l+2^{n-3-t}$. It's easy to prove by induction on $t$ that there's a unique integer $q$ between 0 and $2^{t+2}$ such that $q\mod4=1$ and $q^2\mod2^{t+3}=2^{t+2}a_t+\cdots+2^4a_2+2^3a_1+1$, whenever $t>0$. Hence the lower bits of the square root, $y\mod 2^{t+2}$, must be either $q$ or $2^{t+2}-q$. On the other hand $x_l= for (j=1;j<=t;j++) q[j]=qq[j]=1; @ @= q[p]^=1LL<<(p+1); qq[p]=q[p]*q[p]; for (j=p+1;j<=t;j++) { if (qq[j-1]&(1LL<<(j+2))) q[j]=q[j-1]^(1LL<<(j+1)); else q[j]=q[j-1]; qq[j]=q[j]*q[j]; } @ Similarly, we'll have to compute $2^t$ approximate square roots for the leading bits of~$y$. Let |y[j]| be bits $m$ through~$j$ of $\sqrt{x_l}$, where $m=\lfloor n/2\rfloor-1$ is the index of the leading bit. The classical algorithm for square root extraction tells us how to go from |y[j]| to |y[j-1]|: We have a ``remainder'' |r[j]| representing the difference from the leading bits of~$x_l$ and $|y[j]|^2$, where $|r[j]|\le2|y[j]|$. To preserve this invariant when $a_1\ldots a_t=0\ldots0$, we set |y[j-1]=2y[j]| and |r[j-1]=4r[j]|; if then |r[j-1]>2y[j-1]| we subtract |2y[j-1]+1| from |r[j-1]| and increase |y[j-1]| by~1. To preserve the invariant for other values of $a_1\ldots a_t$, the same steps apply except that $r[j-1]=4r[j]+2a_i+a_{i+1}$ for an appropriate value of~$i$. The bits of the square root need only be computed for $j\ge t+2$; therefore all computations fit easily into a single |long long| register. Once again it's easy to prime the pump when $a_1\ldots a_t=0\ldots0$, and to move to the successor by updating fewer than two entries on average (plus roughly $n/8$ entries ``in the middle'' where $x_l$ has roughly $n/4$ zeros). @= if (n&1) { y[(n-3)/2]=2,r[(n-3)/2]=0; for (j=(n-5)/2;j>=t+2;j--) y[j]=2*y[j+1],r[j]=0; }@+else { y[n/2-1]=1,r[n/2-1]=1; for (j=n/2-2;j>=t+2;j--) { y[j]=2*y[j+1],r[j]=4*r[j+1]; if (r[j]>2*y[j]) r[j]-=2*y[j]+1,y[j]++; } } @ @= j=(n-3-p)/2; if ((n+p)&1) r[j]+=1; else r[j]=4*r[j+1]+2,y[j]=2*y[j+1]; if (r[j]>2*y[j]) r[j]-=2*y[j]+1,y[j]++; for (j--;j>=t+2;j--) { y[j]=2*y[j+1],r[j]=4*r[j+1]; if (r[j]>2*y[j]) r[j]-=2*y[j]+1,y[j]++; } @ Now comes the boring stuff. I hope I don't mess up here. To make the final test, I'll need to square a number of up to 90 bits. I simply treat it as three 32-bit chunks, and multiply by the textbook method. @d m32 0xffffffff /* 32-bit mask */ @= for (j=0;j<3;j++) { prod=trial[j]*trial[0]; if (j) prod+=acc[j]; acc[j]=prod&m32; prod>>=32; prod+=trial[j]*trial[1]; if (j) prod+=acc[j+1]; acc[j+1]=prod&m32; prod>>=32; prod+=trial[j]*trial[2]; if (j) prod+=acc[j+2]; acc[j+2]=prod&m32; acc[j+3]=prod>>32; } @ To manufacture the |trial|, I need to shift the leading digits appropriately and combine them with the trailing digits. First, I put the leading digits into |pretrial| and |trial|. (This can be tricky: If $n=129$ or 130, so that $t=29$, there are 34 leading digits; one of them will go into |trial[0]|, 32 into |trial[1]|, and one into |trial[2]|.) @= if (t+2<32) pretrial[0]=(sqrtxl<<(t+2))&m32, pretrial[1]=(sqrtxl>>(30-t))&m32; else pretrial[0]=0,pretrial[1]=(sqrtxl<<(t-30))&m32; trial[2]=sqrtxl>>(62-t); @ @= if (t+2<=32) trial[0]=pretrial[0]+q[t],trial[1]=pretrial[1]; else trial[0]=q[t]&m32, trial[1]=pretrial[1]+(q[t]>>32); @ @d comp(x) ((1LL<<(t+2))-(x)) @= if (t+2<=32) trial[0]=pretrial[0]+comp(q[t]),trial[1]=pretrial[1]; else trial[0]=comp(q[t])&m32, trial[1]=pretrial[1]+(comp(q[t])>>32); @ I make $t=\lfloor(n-11)/4\rfloor$. (It will be between 1 and 42.) @= t=(n-11)/4; @; @; @ And now at last the denouement, where we put everything together. @= { sqrtxl=y[t+2]; for (p=t,bit=1;a&bit;p--,bit<<=1) ; @; if (y[t+2]>=sqrtxl+4) fprintf(stderr,"Something's wrong in case %llx!\n", a); for (;sqrtxl<=y[t+2];sqrtxl++) { @; @; @; @; @; } @ } @ @= @; for (j=0,k=n-1;j>5]&(1<<(j&0x1f)))!=0); kk=((acc[k>>5]&(1<<(k&0x1f)))!=0); if (jj!=kk) break; } if (j>=k) /* solution! */ printf("%08llx%08llx%08llx^2=%08llx%08llx%08llx%08llx%08llx%08llx\n", trial[2],trial[1],trial[0], acc[5],acc[4],acc[3],acc[2],acc[1],acc[0]); @*Index.