feat(website): expand adopters list after full scan, harden discovery tool

Runs the refreshed find-adopters tool against every public Taskfile on
GitHub (1190 unique repos, 13 min) and uses the findings to swap in four
higher-signal entries: Azure/Azure-Sentinel replaces Microsoft's niche
Fabric provider, flet-dev/flet (16k stars, #1 non-Task hit), Anthropic's
Rust protobuf, and charmbracelet/glamour join the list. Gogs drops out
since it no longer surfaces in the best-match slice GitHub exposes.

Rewrites the discovery strategy: GitHub Code Search caps at 1000 results
per query and its size: qualifier turned out unreliable (non-monotone
total_count, sporadic 404s), so the tool now paginates each of the four
Taskfile variants to the cap and supplements with an org: scan over
~100 curated organizations. That's the practical ceiling without GH
Archive or BigQuery, and it captures every big-brand hit we care about.
Also drops the code-search rate from 24 to 8.5 req/min to match the
real 10 req/min authenticated limit.
This commit is contained in:
Valentin Maerten
2026-04-19 14:01:12 +02:00
parent cb7b4dc235
commit 4bee0c6d66
3 changed files with 153 additions and 74 deletions

View File

@@ -11,16 +11,16 @@ export const adopters: Adopter[] = [
url: 'https://github.com/docker/mcp-registry',
img: 'https://github.com/docker.png'
},
{
name: 'Microsoft',
url: 'https://github.com/Azure/Azure-Sentinel',
img: 'https://github.com/microsoft.png'
},
{
name: 'HashiCorp',
url: 'https://github.com/hashicorp/terraform-aws-terraform-enterprise-hvd',
img: 'https://github.com/hashicorp.png'
},
{
name: 'Microsoft',
url: 'https://github.com/microsoft/terraform-provider-fabric',
img: 'https://github.com/microsoft.png'
},
{
name: 'Vercel',
url: 'https://github.com/vercel/terraform-provider-vercel',
@@ -36,32 +36,42 @@ export const adopters: Adopter[] = [
url: 'https://github.com/aws-samples/appmod-blueprints',
img: 'https://github.com/aws-samples.png'
},
{
name: 'Anthropic',
url: 'https://github.com/anthropics/buffa',
img: 'https://github.com/anthropics.png'
},
// Notable open source projects
{
name: 'Arduino CLI',
url: 'https://github.com/arduino/arduino-cli',
img: 'https://github.com/arduino.png'
name: 'Flet',
url: 'https://github.com/flet-dev/flet',
img: 'https://github.com/flet-dev.png'
},
{
name: 'GoReleaser',
url: 'https://github.com/goreleaser/goreleaser',
img: 'https://github.com/goreleaser.png'
},
{
name: 'Arduino CLI',
url: 'https://github.com/arduino/arduino-cli',
img: 'https://github.com/arduino.png'
},
{
name: 'FerretDB',
url: 'https://github.com/FerretDB/FerretDB',
img: 'https://github.com/FerretDB.png'
},
{
name: 'Gogs',
url: 'https://github.com/gogs/gogs',
img: 'https://github.com/gogs.png'
},
{
name: 'Tyk',
url: 'https://github.com/TykTechnologies/tyk',
img: 'https://github.com/TykTechnologies.png'
},
{
name: 'Charmbracelet',
url: 'https://github.com/charmbracelet/glamour',
img: 'https://github.com/charmbracelet.png'
},
{
name: 'Outline',
url: 'https://github.com/OutlineFoundation/outline-server',

View File

@@ -6,14 +6,33 @@ candidates for the [taskfile.dev](https://taskfile.dev) "Used by" section.
## How it works
GitHub Code Search caps results at 1000 per query. `find-adopters` partitions
queries by **star bucket** (and, for the 0-star bucket, by pushed-year) so
every sub-query stays under the cap. Each unique repo is then enriched via a
single batched GraphQL call (stars, description, owner type, language, topics)
and sorted by popularity.
GitHub Code Search caps at 1000 results per query and only accepts a narrow
set of qualifiers alongside `filename:` — notably `stars:`, `language:`, and
`pushed:` don't combine, and `size:` does but its `total_count` isn't monotone
as ranges shrink, which makes partitioning unreliable. So the tool takes a
pragmatic two-pronged approach:
The full scan typically takes 15-30 minutes, mostly spent respecting the
Code Search rate limit (30 req/min).
1. **Global best-match pagination** — paginate `filename:Taskfile.yml`,
`Taskfile.yaml`, `Taskfile.dist.yml`, and `Taskfile.dist.yaml` directly up
to the 1000-result cap. Captures the top ~900 best-ranked hits per variant.
2. **Per-org scan** — iterate a built-in list of ~100 well-known organizations
(hyperscalers, OSS vendors, DevOps platforms, etc.) with
`filename:Taskfile.yml org:<name>`. Captures every Taskfile inside those
orgs even when their repos don't rank in the global top.
The union is deduplicated and enriched via batched GraphQL calls (stars,
description, owner type, language, topics), then sorted by stars.
A full scan typically takes 15-25 minutes — about 120 Code Search calls at the
10 req/min authenticated rate limit, plus a handful of GraphQL batches.
### Coverage caveat
GitHub's hard 1000-result cap on the Code Search API means this tool cannot
enumerate every Taskfile on GitHub — only the best-ranked slice plus the
curated orgs. For truly exhaustive coverage, consider
[GH Archive](https://www.gharchive.org/) or the BigQuery public GitHub
dataset, which are out of scope here.
## Usage

View File

@@ -83,7 +83,7 @@ func newClient(token string, verbose bool) *client {
http: &http.Client{Timeout: 60 * time.Second},
token: token,
verbose: verbose,
searchGap: 2500 * time.Millisecond, // ~24 req/min, under the 30/min cap
searchGap: 7 * time.Second, // ~8.5 req/min, under the 10/min cap
}
}
@@ -187,16 +187,30 @@ func urlEscape(s string) string {
return b.String()
}
// paginateQuery pages through up to 1000 results for a single query. It
// returns early if the reported total_count > 1000 (so the caller can narrow).
func (c *client) paginateQuery(q string) (repos []string, total int, err error) {
for page := 1; page <= 10; page++ {
// paginateQuery pages through up to 1000 results. If total_count is above the
// cap reachable by pageLimit pages, it still paginates — callers that want to
// avoid wasted calls and subdivide instead should check total beforehand by
// passing pageLimit=1.
func (c *client) paginateQuery(q string, pageLimit int) (repos []string, total int, err error) {
first, err := c.searchCode(q, 1)
if err != nil {
return nil, 0, err
}
total = first.TotalCount
if total == 0 {
return nil, 0, nil
}
for _, it := range first.Items {
repos = append(repos, it.Repository.FullName)
}
pages := (total + 99) / 100
if pages > pageLimit {
pages = pageLimit
}
for page := 2; page <= pages; page++ {
sr, err := c.searchCode(q, page)
if err != nil {
return nil, 0, err
}
if page == 1 {
total = sr.TotalCount
return repos, total, err
}
for _, it := range sr.Items {
repos = append(repos, it.Repository.FullName)
@@ -208,63 +222,99 @@ func (c *client) paginateQuery(q string) (repos []string, total int, err error)
return repos, total, nil
}
// discover partitions the search space by star bucket (and pushed-year for the
// stars:0 bucket) so every sub-query stays below the 1000-result cap.
// GitHub Code Search caps at 1000 results per query and is unreliable with
// the `size:` qualifier (total_count is non-monotone as ranges shrink), so
// partitioning tricks don't work cleanly. We instead combine two strategies:
//
// 1. Paginate each Taskfile variant directly — gets ~900 top-ranked hits per
// variant (the "best match" slice GitHub surfaces).
// 2. Iterate a curated list of well-known organizations with an explicit
// `org:` qualifier — gets full coverage inside big brands even when their
// repos don't rank in the global top 900.
//
// The union is deduplicated and enriched via GraphQL.
// knownOrgs is a snapshot of organizations worth scanning explicitly. Adding
// here captures every Taskfile inside the org regardless of its global rank.
// Loosely ordered from most likely to least.
var knownOrgs = []string{
// Hyperscalers / clouds
"docker", "microsoft", "google", "GoogleCloudPlatform", "aws", "awslabs",
"aws-samples", "amazon-science", "Azure", "Azure-Samples",
// Infra / DevOps vendors
"hashicorp", "hashicorp-forge", "vercel", "cloudflare", "digitalocean",
"heroku", "JetBrains", "pulumi", "buildkite", "circleci", "dagger",
"temporalio", "encoredev", "argoproj", "fluxcd", "flux-framework",
// Dev tools / platforms
"netflix", "shopify", "airbnb", "uber", "lyft", "stripe", "github",
"gitlabhq", "atlassian", "RedHat", "RedHatOfficial", "openshift",
// Communication / consumer
"spotify", "slackapi", "discord", "figma", "linear", "twilio", "segmentio",
// Data / ML
"prisma", "supabase", "railwayapp", "superfly", "fly-apps", "planetscale",
"tailscale", "coder", "anthropics", "openai", "huggingface",
"pytorch", "tensorflow",
// Observability / CNCF
"grafana", "prometheus", "envoyproxy", "getsentry", "sentry", "cncf",
"helm", "istio", "linkerd", "traefik", "caddyserver",
// Frontend frameworks
"vitejs", "biomejs", "sveltejs", "vuejs", "reactjs", "astro", "nuxt",
// Databases
"mongodb-labs", "redis", "neo4j", "elastic", "influxdata", "timescale",
"clickhouse", "FerretDB",
// Go ecosystem / popular OSS
"goreleaser", "spf13", "urfave", "charmbracelet", "nodejs", "golang",
"rust-lang", "python", "apache", "etcd-io", "grpc", "arduino",
// Data eng
"dbt-labs", "astronomer", "prefecthq",
}
// discover walks every Taskfile variant with global pagination plus per-org
// scans, and returns unique owner/name pairs.
func (c *client) discover() (map[string]struct{}, error) {
uniq := make(map[string]struct{})
// Star buckets cover the whole range.
buckets := []string{
"stars:>=1000",
"stars:100..999",
"stars:10..99",
"stars:1..9",
"stars:0",
variants := []string{
"Taskfile.yml",
"Taskfile.yaml",
"Taskfile.dist.yml",
"Taskfile.dist.yaml",
}
extensions := []string{"Taskfile.yml", "Taskfile.yaml"}
for _, ext := range extensions {
for _, bucket := range buckets {
q := fmt.Sprintf("filename:%s %s", ext, bucket)
c.logf("query: %s", q)
repos, total, err := c.paginateQuery(q)
if err != nil {
fmt.Fprintf(os.Stderr, "warn: %v\n", err)
continue
}
c.logf(" total=%d collected=%d", total, len(repos))
for _, r := range repos {
uniq[r] = struct{}{}
}
if total > 1000 {
// Partition the bucket further by pushed-year.
c.logf(" bucket exceeds 1000, subdividing by pushed-year")
if err := c.discoverByPushedYear(ext, bucket, uniq); err != nil {
fmt.Fprintf(os.Stderr, "warn: pushed partition: %v\n", err)
}
}
}
}
return uniq, nil
}
func (c *client) discoverByPushedYear(ext, bucket string, uniq map[string]struct{}) error {
now := time.Now().Year()
// GitHub Search was introduced in 2008 — years before that won't have hits.
for year := 2015; year <= now; year++ {
q := fmt.Sprintf("filename:%s %s pushed:%d-01-01..%d-12-31", ext, bucket, year, year)
c.logf(" sub-query: %s", q)
repos, total, err := c.paginateQuery(q)
c.logf("phase: global pagination (best-match top ~900 per variant)")
for _, v := range variants {
q := fmt.Sprintf("filename:%s", v)
c.logf(" query: %s", q)
repos, total, err := c.paginateQuery(q, 10)
if err != nil {
return err
fmt.Fprintf(os.Stderr, "warn: variant %s: %v\n", v, err)
continue
}
c.logf(" total=%d collected=%d", total, len(repos))
for _, r := range repos {
uniq[r] = struct{}{}
}
}
return nil
c.logf("phase: per-org scan (%d orgs)", len(knownOrgs))
for _, org := range knownOrgs {
q := fmt.Sprintf("filename:Taskfile.yml org:%s", org)
repos, total, err := c.paginateQuery(q, 10)
if err != nil {
// Orgs that don't exist return 404 — log once and move on.
c.logf(" skip %s: %v", org, err)
continue
}
if total == 0 {
continue
}
c.logf(" org:%s total=%d collected=%d", org, total, len(repos))
for _, r := range repos {
uniq[r] = struct{}{}
}
}
return uniq, nil
}
// ----- Enrichment (GraphQL) -----