diff --git a/src/fmpz_vec/content_chained.c b/src/fmpz_vec/content_chained.c index 6572bcd1b5..2c9782788f 100644 --- a/src/fmpz_vec/content_chained.c +++ b/src/fmpz_vec/content_chained.c @@ -1,59 +1,498 @@ /* + Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc. Copyright (C) 2020 Daniel Schultz Copyright (C) 2021 Fredrik Johansson + Copyright (C) 2024 Albin Ahlbäck This file is part of FLINT. + Contains code from mpn_gcd_1 in GMP 6.3.0. + FLINT is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License (LGPL) as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. See . */ +#include "mpn_extras.h" #include "fmpz.h" #include "fmpz_vec.h" +/* Calculating the GCD is a very costly operation. + + Hence, our attempt is to find a single-limbed integer of which we can start + the process with, because calculating the GCD of {u, n} and {v, 1} is much + more effective than calculating if of two n-limbed integers. + + Initial strategy to find a single-limbed integer: + * `ip` will often be small -- start by checking its total size. + * `ip` may contain factors of two -- check if reducing it leads to a single + limb. + * `vp` may contain single-limbed integers at specific places -- imagine it + representing a polynomial; many polynomials are small at the ends. +*/ + +#define SIZ(x) ((x)->_mp_size) +#define PTR(x) ((x)->_mp_d) + +typedef struct +{ + mp_srcptr d; + ulong sz; +} +mpn_struct; + +typedef mpn_struct * mpn_ptr; + +FLINT_STATIC_NOINLINE void heavy_machinery(fmpz_t, const fmpz *, slong, const fmpz_t); + +/* In this function we only take care of small inputs. If none is found, we go + * to heavy_machinery instead. */ void -_fmpz_vec_content_chained(fmpz_t res, const fmpz * vec, slong len, const fmpz_t in) +_fmpz_vec_content_chained(fmpz_t rp, const fmpz * vp, slong vn, const fmpz_t ip) { - while (len > 0 && fmpz_is_zero(vec + 0)) + slong jx = -1, kx; + ulong gd; /* single limb gcd */ + int exp; + mpz_srcptr mp; + slong msz; + mp_srcptr md; + +#if FLINT_WANT_ASSERT + gd = 0; +#endif + + /* We will assume that vn >= 2 for everything that will come */ + if (vn < WORD(2)) { - len--; - vec++; + if (ip == NULL) + fmpz_abs(rp, vp + 0); + else + fmpz_gcd(rp, vp + 0, ip); + + return; } - while (len > 1 && fmpz_is_zero(vec + len - 1)) - len--; + /* Check if `ip` is a single limb */ + if (ip != NULL) + { + fmpz tip = *ip; /* Read-only */ - if (len == 0) + if (!COEFF_IS_MPZ(tip)) + { + if (tip == WORD(0)) + { + ip = NULL; + goto check_vp_may_be_zero; + } + + gd = FLINT_ABS(tip); + goto ip1; + } + else + { + mp = COEFF_TO_PTR(tip); + msz = FLINT_ABS(SIZ(mp)); + + if (msz == 1) + { + gd = PTR(mp)[0]; + goto ip1; + } + } + } + +check_vp_may_be_zero: /* Fast checks only. Result can be zero. */ + for (jx = 0; jx < vn; jx++) { - fmpz_abs(res, in); - return; + if (!fmpz_is_zero(vp + jx)) + goto check_vp_not_zero; + else if (!COEFF_IS_MPZ(vp[jx])) + goto found_small; } - if (len == 1) + if (ip == NULL) + fmpz_zero(rp); + else + fmpz_abs(rp, ip); + + return; + +check_vp_not_zero: /* Fast checks only. Result cannot be zero from this point onwards. */ + for (jx++; jx < vn; jx++) + if (!fmpz_is_zero(vp + jx) && !COEFF_IS_MPZ(vp[jx])) + goto found_small; + + /* Alright, bring it on */ + heavy_machinery(rp, vp, vn, ip); + return; + +found_small: /* found small inside vp */ + gd = FLINT_ABS(vp[jx]); + +reduce_gd: /* gd is set, but we need to reduce it */ + exp = flint_ctz(gd); + + /* Take care of ip before next step */ + if (ip != NULL) { - fmpz_gcd(res, vec + 0, in); - return; + slong tsz; + mp = COEFF_TO_PTR(*ip); /* ip cannot be small */ + tsz = msz = FLINT_ABS(SIZ(mp)); + md = PTR(mp); + + /* Remove trailing zero limbs */ + while (*md == UWORD(0)) + { + md++; + msz--; + } + + if (tsz != msz) + /* keep exp */; + else + exp = FLINT_MIN(exp, flint_ctz(md[0])); + + gd = mpn_gcd_1(md, msz, gd); + + if (gd == UWORD(1)) + goto gd_eq_1; } - if (fmpz_is_pm1(in) || fmpz_is_pm1(vec + 0) || fmpz_is_pm1(vec + len - 1)) + goto compute; + +ip1: /* We found ip to be small */ + /* gd is set but unreduced and exp is not set */ + FLINT_ASSERT(gd != UWORD(0)); + exp = flint_ctz(gd); + gd >>= exp; + + if (gd == UWORD(1)) + goto gd_eq_1; + +compute: /* gd is set and ip has been taken care of */ + FLINT_ASSERT(gd != UWORD(0)); + FLINT_ASSERT(gd != UWORD(1)); + for (kx = 0; kx < vn; kx++) { - fmpz_one(res); - return; + FLINT_ASSERT(gd & 1 == 1); + + /* If kx == jx, we have already taken care of this entry */ + if (fmpz_is_zero(vp + kx) || kx == jx) + continue; + + if (gd == UWORD(1)) + goto gd_eq_1; + + if (!COEFF_IS_MPZ(vp[kx])) + { + ulong td = FLINT_ABS(vp[kx]); + int c; + + c = flint_ctz(td); + td >>= c; + + exp = FLINT_MIN(exp, c); + + /* Make td bigger */ + if (td < gd) + FLINT_SWAP(ulong, td, gd); + + /* Now td >= gd */ + /* If much bigger, reduce via remainder first */ + if ((td >> 16) > gd) + { + td %= gd; + if (td == UWORD(0)) /* td was a multiple of gd */ + continue; + + c = flint_ctz(td); + td >>= c; + } + + gd = mpn_gcd_11(td, gd); + } + else + { + mp = COEFF_TO_PTR(vp[kx]); + msz = FLINT_ABS(SIZ(mp)); + md = PTR(mp); + + gd = mpn_gcd_1(md, msz, gd); + } + } + + goto fin; + +gd_eq_1: /* gd == 1, but we may need to check for common factors of 2 */ + FLINT_ASSERT(gd == UWORD(1)); + + if (exp == 0) + goto fin; + + for (; kx < vn; kx++) + { + if (fmpz_is_zero(vp + kx)) + continue; + + if (!COEFF_IS_MPZ(vp[kx])) + { + exp = FLINT_MIN(flint_ctz(FLINT_ABS(vp[kx])), exp); + + if (exp == 0) + goto fin; + } + else + { + mp = COEFF_TO_PTR(vp[kx]); + md = PTR(mp); + + if (*md == UWORD(0)) + continue; + + exp = FLINT_MIN(flint_ctz(*md), exp); + + if (exp == 0) + goto fin; + } + } + +fin: + FLINT_ASSERT(gd != UWORD(0)); + fmpz_set_ui(rp, gd << exp); + return; +} + +/* Everything is an mpz */ +FLINT_STATIC_NOINLINE +void heavy_machinery(fmpz_t rp, const fmpz * vp, slong vn, const fmpz_t ip) +{ + mpn_ptr mvp; + ulong limbs = UWORD_MAX, exp; + slong idx0, idx1; + ulong sz0 = UWORD_MAX, sz1 = UWORD_MAX; + + ulong a0; + + slong ix; + + if (vn < WORD(2)) + FLINT_UNREACHABLE; + + mvp = flint_malloc(sizeof(mpn_struct) * (vn + (ip != NULL))); + + /* Start by adding entries to mvp */ + if (ip != NULL) + { + mpz_srcptr mp; + mp_srcptr md; + ulong sz, tz; + + mp = COEFF_TO_PTR(*ip); + tz = sz = FLINT_ABS(SIZ(mp)); + md = PTR(mp); + + while (*md == UWORD(0)) + { + md++; + sz--; + } + + limbs = tz - sz; + mvp[0].d = md; + mvp[0].sz = sz; + mvp++; + + sz0 = sz; + idx0 = -1; } - fmpz_gcd3(res, vec + 0, vec + len - 1, in); - vec++; - len -= 2; + for (ix = 0, vp += vn - 1; ix < vn; vp--) + { + mpz_srcptr mp; + mp_srcptr md; + ulong sz, tz; - while (len >= 2 && !fmpz_is_one(res)) + if (fmpz_is_zero(vp)) + { + vn--; + continue; + } + + mp = COEFF_TO_PTR(*vp); + tz = sz = FLINT_ABS(SIZ(mp)); + md = PTR(mp); + + while (*md == UWORD(0)) + { + md++; + sz--; + } + + limbs = FLINT_MIN(limbs, tz - sz); + mvp[ix].d = md; + mvp[ix].sz = sz; + ix++; + + if (sz < sz0) + { + sz1 = sz0; + sz0 = sz; + idx1 = idx0; + idx0 = ix; + } + else if (sz < sz1) + { + sz1 = sz; + idx1 = ix; + } + } + + /* Check if we can fast-track the initial GCD */ + if (sz1 == UWORD(1)) { - fmpz_gcd3(res, vec + 0, vec + len - 1, res); - vec++; - len -= 2; + /* 1 = sz0 = sz1 */ + ulong b0; + ulong bexp; + + a0 = mvp[idx0].d[0]; + b0 = mvp[idx1].d[0]; + + exp = flint_ctz(a0); + bexp = flint_ctz(b0); + + a0 >>= exp; + b0 >>= bexp; + exp = FLINT_MIN(exp, bexp); + + if (b0 < a0) + FLINT_SWAP(ulong, a0, b0); + + /* Now b0 >= a0 */ + /* If much bigger, reduce via remainder first */ + if ((b0 >> 16) > a0) + { + b0 %= a0; + if (b0 == UWORD(0)) /* b0 was a multiple of a0 */ + goto G1; + + b0 >>= flint_ctz(b0); + } + + a0 = mpn_gcd_11(b0, a0); + + if (a0 == UWORD(1)) + goto a0_eq_1; + + goto G1; + } + else if (sz0 == UWORD(1)) + { + /* 1 = sz0 < sz1 */ + ulong bexp; + mp_srcptr bd; + ulong bn; + + a0 = mvp[idx0].d[0]; + bd = mvp[idx1].d; + bn = mvp[idx1].sz; + + exp = flint_ctz(a0); + bexp = flint_ctz(bd[0]); + + a0 >>= exp; + exp = FLINT_MIN(exp, bexp); + + a0 = mpn_gcd_1(bd, bn, a0); + + if (a0 == UWORD(1)) + goto a0_eq_1; + + goto G1; + } + else if (sz1 == UWORD(2)) + { + /* 2 = sz0 = sz1 */ + ulong a1, b0, b1; + ulong bexp; + mp_limb_pair_t a; + + a0 = mvp[idx0].d[0]; + a1 = mvp[idx0].d[1]; + b0 = mvp[idx1].d[0]; + b1 = mvp[idx1].d[1]; + + exp = flint_ctz(a0); + bexp = flint_ctz(b0); + + a0 = (a0 >> exp) | (a1 << (FLINT_BITS - exp)); + a1 = (a1 >> exp); + b0 = (b0 >> bexp) | (b1 << (FLINT_BITS - bexp)); + b1 = (b1 >> bexp); + + a = mpn_gcd_22(b1, b0, a1, a0); + a1 = a.m2; + a0 = a.m1; + + if (a1 == UWORD(0)) + { + if (a0 == UWORD(1)) + goto a0_eq_1; + else + goto G1; + } + + goto G2; + } + else if (sz0 == UWORD(2)) + { + /* 2 = sz0 < sz1 */ + /* Try to reduce `a` to a single limb */ + ulong a1; + a0 = mvp[idx0].d[0]; + a1 = mvp[idx0].d[1]; + + exp = flint_ctz(a0); + + a0 = (a0 >> exp) | (a1 << (FLINT_BITS - exp)); + a1 = (a1 >> exp); + + if (a1 == UWORD(0)) + { + exp = FLINT_MIN(flint_ctz(mvp[idx1].d[0]), exp); + a0 = mpn_gcd_1(mvp[idx1].d, mvp[idx1].sz, a0); + + if (a0 == UWORD(1)) + goto a0_eq_1; + else + goto G1; + } + else + goto G2; } - if (len != 0 && !fmpz_is_one(res)) - fmpz_gcd(res, res, vec + 0); +Gn: /* TODO: + 0. Allocate temporary space for two integers -- one for the output (say + OP) and one for the input (say IP), + 1. Take the smallest sample we have (given at idx0), and right shift it + into OP, + 2. Take the next smallest sample and right shift it into IP, + 3. Compute OP <- GCD(IP, OP), + 4. If we are at the end, exit. + 5. If OP can fit in a limb, go to G1. + 6. If OP can fit in two limbs, go to G2. + 7. Take the next sample (no specific order), right shift it into IP and + go to step 3. */ + +G2: /* TODO: Do mpn_divrem_2 and continue with mpn_gcd_22 */ + +G1: for (ix = -1; ix < vn; ix++) + ; + +a0_eq_1: /* a0 == 1, but we may need to check for common factors of 2 */ + FLINT_ASSERT(a0 == UWORD(1)); + +end: + + flint_free(mvp - (ip != NULL)); } diff --git a/src/mpn_extras.h b/src/mpn_extras.h index adef8c0701..68aae685d3 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -785,9 +785,13 @@ int flint_mpn_factor_trial_tree(slong * factors, mp_srcptr x, mp_size_t xsize, s /* greatest common divisor ***************************************************/ +#define mpn_gcd_22 __gmpn_gcd_22 + mp_size_t flint_mpn_gcd_full2(mp_ptr gp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr scr); mp_size_t flint_mpn_gcd_full(mp_ptr gp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn); +mp_limb_pair_t mpn_gcd_22(mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); + /* modular arithmetic ********************************************************/ void flint_mpn_mulmod_preinv1(mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_srcptr d, mp_limb_t dinv, ulong norm);