feat(website): expand adopters list after full scan, harden discovery tool

Runs the refreshed find-adopters tool against the public Taskfiles reachable through GitHub Code Search (1190 unique repos, 13 min) and uses the findings to swap in four higher-signal entries: Azure/Azure-Sentinel replaces Microsoft's niche Fabric provider, while flet-dev/flet (16k stars, #1 non-Task hit), Anthropic's Rust protobuf project (anthropics/buffa), and charmbracelet/glamour join the list. Gogs drops out since it no longer surfaces in the best-match slice GitHub exposes. Rewrites the discovery strategy: GitHub Code Search caps at 1000 results per query and its size: qualifier turned out unreliable (non-monotone total_count, sporadic 404s), so the tool now paginates each of the four Taskfile variants to the cap and supplements with an org: scan over ~100 curated organizations. That's the practical ceiling without GH Archive or BigQuery, and it captures every big-brand hit we care about. Also drops the code-search rate from 24 to 8.5 req/min to stay under the real 10 req/min authenticated limit.
2026-06-11 09:51:50 +00:00 · 2026-04-19 14:01:12 +02:00
parent cb7b4dc235
commit 4bee0c6d66
3 changed files with 153 additions and 74 deletions
--- a/website/.vitepress/adopters.ts
+++ b/website/.vitepress/adopters.ts
@@ -11,16 +11,16 @@ export const adopters: Adopter[] = [
    url: 'https://github.com/docker/mcp-registry',
    img: 'https://github.com/docker.png'
  },
+  {
+    name: 'Microsoft',
+    url: 'https://github.com/Azure/Azure-Sentinel',
+    img: 'https://github.com/microsoft.png'
+  },
  {
    name: 'HashiCorp',
    url: 'https://github.com/hashicorp/terraform-aws-terraform-enterprise-hvd',
    img: 'https://github.com/hashicorp.png'
  },
-  {
-    name: 'Microsoft',
-    url: 'https://github.com/microsoft/terraform-provider-fabric',
-    img: 'https://github.com/microsoft.png'
-  },
  {
    name: 'Vercel',
    url: 'https://github.com/vercel/terraform-provider-vercel',
@@ -36,32 +36,42 @@ export const adopters: Adopter[] = [
    url: 'https://github.com/aws-samples/appmod-blueprints',
    img: 'https://github.com/aws-samples.png'
  },
+  {
+    name: 'Anthropic',
+    url: 'https://github.com/anthropics/buffa',
+    img: 'https://github.com/anthropics.png'
+  },
  // Notable open source projects
  {
-    name: 'Arduino CLI',
-    url: 'https://github.com/arduino/arduino-cli',
-    img: 'https://github.com/arduino.png'
+    name: 'Flet',
+    url: 'https://github.com/flet-dev/flet',
+    img: 'https://github.com/flet-dev.png'
  },
  {
    name: 'GoReleaser',
    url: 'https://github.com/goreleaser/goreleaser',
    img: 'https://github.com/goreleaser.png'
  },
+  {
+    name: 'Arduino CLI',
+    url: 'https://github.com/arduino/arduino-cli',
+    img: 'https://github.com/arduino.png'
+  },
  {
    name: 'FerretDB',
    url: 'https://github.com/FerretDB/FerretDB',
    img: 'https://github.com/FerretDB.png'
  },
-  {
-    name: 'Gogs',
-    url: 'https://github.com/gogs/gogs',
-    img: 'https://github.com/gogs.png'
-  },
  {
    name: 'Tyk',
    url: 'https://github.com/TykTechnologies/tyk',
    img: 'https://github.com/TykTechnologies.png'
  },
+  {
+    name: 'Charmbracelet',
+    url: 'https://github.com/charmbracelet/glamour',
+    img: 'https://github.com/charmbracelet.png'
+  },
  {
    name: 'Outline',
    url: 'https://github.com/OutlineFoundation/outline-server',
--- a/website/scripts/find-adopters/README.md
+++ b/website/scripts/find-adopters/README.md
@@ -6,14 +6,33 @@ candidates for the [taskfile.dev](https://taskfile.dev) "Used by" section.

 ## How it works

-GitHub Code Search caps results at 1000 per query. `find-adopters` partitions
-queries by **star bucket** (and, for the 0-star bucket, by pushed-year) so
-every sub-query stays under the cap. Each unique repo is then enriched via a
-single batched GraphQL call (stars, description, owner type, language, topics)
-and sorted by popularity.
+GitHub Code Search caps at 1000 results per query and only accepts a narrow
+set of qualifiers alongside `filename:` — notably `stars:`, `language:`, and
+`pushed:` don't combine, and `size:` does but its `total_count` isn't monotone
+as ranges shrink, which makes partitioning unreliable. So the tool takes a
+pragmatic two-pronged approach:

-The full scan typically takes 15-30 minutes, mostly spent respecting the
-Code Search rate limit (30 req/min).
+1. **Global best-match pagination** — paginate `filename:Taskfile.yml`,
+   `Taskfile.yaml`, `Taskfile.dist.yml`, and `Taskfile.dist.yaml` directly up
+   to the 1000-result cap. Captures the top ~900 best-ranked hits per variant.
+2. **Per-org scan** — iterate a built-in list of ~100 well-known organizations
+   (hyperscalers, OSS vendors, DevOps platforms, etc.) with
+   `filename:Taskfile.yml org:<name>`. Captures the `Taskfile.yml` hits inside those
+   orgs (other variants rely on pass 1) even when their repos don't rank in the global top.
+
+The union is deduplicated and enriched via batched GraphQL calls (stars,
+description, owner type, language, topics), then sorted by stars.
+
+A full scan typically takes 15-25 minutes — about 120 Code Search calls paced at
+~8.5 req/min (under the 10 req/min authenticated limit), plus a handful of GraphQL batches.
+
+### Coverage caveat
+
+GitHub's hard 1000-result cap on the Code Search API means this tool cannot
+enumerate every Taskfile on GitHub — only the best-ranked slice plus the
+curated orgs. For truly exhaustive coverage, consider
+[GH Archive](https://www.gharchive.org/) or the BigQuery public GitHub
+dataset, which are out of scope here.

 ## Usage

--- a/website/scripts/find-adopters/main.go
+++ b/website/scripts/find-adopters/main.go
@@ -83,7 +83,7 @@ func newClient(token string, verbose bool) *client {
 		http:      &http.Client{Timeout: 60 * time.Second},
 		token:     token,
 		verbose:   verbose,
-		searchGap: 2500 * time.Millisecond, // ~24 req/min, under the 30/min cap
+		searchGap: 7 * time.Second, // ~8.5 req/min, under the 10/min cap
 	}
 }

@@ -187,16 +187,30 @@ func urlEscape(s string) string {
 	return b.String()
 }

-// paginateQuery pages through up to 1000 results for a single query. It
-// returns early if the reported total_count > 1000 (so the caller can narrow).
-func (c *client) paginateQuery(q string) (repos []string, total int, err error) {
-	for page := 1; page <= 10; page++ {
+// paginateQuery fetches up to pageLimit pages (100 results each) for q. It keeps
+// paginating even when total_count exceeds that reach; callers that would rather
+// subdivide can probe total first with pageLimit=1. An error after page 1 returns
+// the repos collected so far along with the error.
+func (c *client) paginateQuery(q string, pageLimit int) (repos []string, total int, err error) {
+	first, err := c.searchCode(q, 1)
+	if err != nil {
+		return nil, 0, err
+	}
+	total = first.TotalCount
+	if total == 0 {
+		return nil, 0, nil
+	}
+	for _, it := range first.Items {
+		repos = append(repos, it.Repository.FullName)
+	}
+	pages := (total + 99) / 100
+	if pages > pageLimit {
+		pages = pageLimit
+	}
+	for page := 2; page <= pages; page++ {
 		sr, err := c.searchCode(q, page)
 		if err != nil {
-			return nil, 0, err
-		}
-		if page == 1 {
-			total = sr.TotalCount
+			return repos, total, err
 		}
 		for _, it := range sr.Items {
 			repos = append(repos, it.Repository.FullName)
@@ -208,63 +222,99 @@ func (c *client) paginateQuery(q string) (repos []string, total int, err error)
 	return repos, total, nil
 }

-// discover partitions the search space by star bucket (and pushed-year for the
-// stars:0 bucket) so every sub-query stays below the 1000-result cap.
+// GitHub Code Search caps at 1000 results per query and is unreliable with
+// the `size:` qualifier (total_count is non-monotone as ranges shrink), so
+// partitioning tricks don't work cleanly. We instead combine two strategies:
+//
+//  1. Paginate each Taskfile variant directly — gets ~900 top-ranked hits per
+//     variant (the "best match" slice GitHub surfaces).
+//  2. Iterate a curated list of well-known organizations with an explicit
+//     `org:` qualifier — surfaces Taskfile.yml hits inside big brands even when
+//     their repos don't rank in the global top 900.
+//
+// The union is deduplicated and enriched via GraphQL.
+
+// knownOrgs is a snapshot of organizations worth scanning explicitly. Adding an
+// org here makes discover query its Taskfile.yml files regardless of global rank.
+// Loosely ordered from most likely to least.
+var knownOrgs = []string{
+	// Hyperscalers / clouds
+	"docker", "microsoft", "google", "GoogleCloudPlatform", "aws", "awslabs",
+	"aws-samples", "amazon-science", "Azure", "Azure-Samples",
+	// Infra / DevOps vendors
+	"hashicorp", "hashicorp-forge", "vercel", "cloudflare", "digitalocean",
+	"heroku", "JetBrains", "pulumi", "buildkite", "circleci", "dagger",
+	"temporalio", "encoredev", "argoproj", "fluxcd", "flux-framework",
+	// Dev tools / platforms
+	"netflix", "shopify", "airbnb", "uber", "lyft", "stripe", "github",
+	"gitlabhq", "atlassian", "RedHat", "RedHatOfficial", "openshift",
+	// Communication / consumer
+	"spotify", "slackapi", "discord", "figma", "linear", "twilio", "segmentio",
+	// Data / ML
+	"prisma", "supabase", "railwayapp", "superfly", "fly-apps", "planetscale",
+	"tailscale", "coder", "anthropics", "openai", "huggingface",
+	"pytorch", "tensorflow",
+	// Observability / CNCF
+	"grafana", "prometheus", "envoyproxy", "getsentry", "sentry", "cncf",
+	"helm", "istio", "linkerd", "traefik", "caddyserver",
+	// Frontend frameworks
+	"vitejs", "biomejs", "sveltejs", "vuejs", "reactjs", "astro", "nuxt",
+	// Databases
+	"mongodb-labs", "redis", "neo4j", "elastic", "influxdata", "timescale",
+	"clickhouse", "FerretDB",
+	// Go ecosystem / popular OSS
+	"goreleaser", "spf13", "urfave", "charmbracelet", "nodejs", "golang",
+	"rust-lang", "python", "apache", "etcd-io", "grpc", "arduino",
+	// Data eng
+	"dbt-labs", "astronomer", "prefecthq",
+}
+
+// discover walks every Taskfile variant with global pagination plus per-org
+// scans, and returns unique owner/name pairs.
 func (c *client) discover() (map[string]struct{}, error) {
 	uniq := make(map[string]struct{})

-	// Star buckets cover the whole range.
-	buckets := []string{
-		"stars:>=1000",
-		"stars:100..999",
-		"stars:10..99",
-		"stars:1..9",
-		"stars:0",
+	variants := []string{
+		"Taskfile.yml",
+		"Taskfile.yaml",
+		"Taskfile.dist.yml",
+		"Taskfile.dist.yaml",
 	}

-	extensions := []string{"Taskfile.yml", "Taskfile.yaml"}
-
-	for _, ext := range extensions {
-		for _, bucket := range buckets {
-			q := fmt.Sprintf("filename:%s %s", ext, bucket)
-			c.logf("query: %s", q)
-			repos, total, err := c.paginateQuery(q)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "warn: %v\n", err)
-				continue
-			}
-			c.logf("  total=%d collected=%d", total, len(repos))
-			for _, r := range repos {
-				uniq[r] = struct{}{}
-			}
-			if total > 1000 {
-				// Partition the bucket further by pushed-year.
-				c.logf("  bucket exceeds 1000, subdividing by pushed-year")
-				if err := c.discoverByPushedYear(ext, bucket, uniq); err != nil {
-					fmt.Fprintf(os.Stderr, "warn: pushed partition: %v\n", err)
-				}
-			}
-		}
-	}
-	return uniq, nil
-}
-
-func (c *client) discoverByPushedYear(ext, bucket string, uniq map[string]struct{}) error {
-	now := time.Now().Year()
-	// GitHub Search was introduced in 2008 — years before that won't have hits.
-	for year := 2015; year <= now; year++ {
-		q := fmt.Sprintf("filename:%s %s pushed:%d-01-01..%d-12-31", ext, bucket, year, year)
-		c.logf("  sub-query: %s", q)
-		repos, total, err := c.paginateQuery(q)
+	c.logf("phase: global pagination (best-match top ~900 per variant)")
+	for _, v := range variants {
+		q := fmt.Sprintf("filename:%s", v)
+		c.logf("  query: %s", q)
+		repos, total, err := c.paginateQuery(q, 10)
 		if err != nil {
-			return err
+			fmt.Fprintf(os.Stderr, "warn: variant %s: %v\n", v, err)
+			continue
 		}
 		c.logf("    total=%d collected=%d", total, len(repos))
 		for _, r := range repos {
 			uniq[r] = struct{}{}
 		}
 	}
-	return nil
+
+	c.logf("phase: per-org scan (%d orgs)", len(knownOrgs))
+	for _, org := range knownOrgs {
+		q := fmt.Sprintf("filename:Taskfile.yml org:%s", org)
+		repos, total, err := c.paginateQuery(q, 10)
+		if err != nil {
+			// Most often a nonexistent org; any error is logged and the org skipped.
+			c.logf("  skip %s: %v", org, err)
+			continue
+		}
+		if total == 0 {
+			continue
+		}
+		c.logf("  org:%s total=%d collected=%d", org, total, len(repos))
+		for _, r := range repos {
+			uniq[r] = struct{}{}
+		}
+	}
+
+	return uniq, nil
 }

 // ----- Enrichment (GraphQL) -----