From ca0b3d080d10a542521239ee66ac1ff45fdf4361 Mon Sep 17 00:00:00 2001 From: Peter Muehlbacher Date: Wed, 18 Feb 2026 10:39:00 +0000 Subject: [PATCH] Fix merge parameter descriptions to prevent CC misuse Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/plugin.json | 2 +- docs/mcp-server.md | 11 ++++---- docs/reference/MERGE.md | 30 ++++++++++++--------- everyrow-mcp/README.md | 16 +++++------ everyrow-mcp/src/everyrow_mcp/server.py | 36 +++++++++++++++++++------ skills/everyrow-sdk/SKILL.md | 24 +++++++++-------- src/everyrow/ops.py | 14 +++++----- 7 files changed, 80 insertions(+), 53 deletions(-) diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 12389519..98dc4eb0 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "everyrow", "description": "Claude Code plugin for the everyrow SDK - AI-powered data processing utilities for transforming, deduping, merging, ranking, and screening dataframes", - "version": "0.3.1", + "version": "0.3.3", "author": { "name": "FutureSearch" }, diff --git a/docs/mcp-server.md b/docs/mcp-server.md index 32163f2e..a827eb12 100644 --- a/docs/mcp-server.md +++ b/docs/mcp-server.md @@ -51,15 +51,16 @@ Returns `task_id` and `session_url`. Call `everyrow_progress` to monitor. ### everyrow_merge -Join two CSVs using intelligent entity matching. +Join two CSVs using intelligent entity matching (LEFT JOIN semantics). | Parameter | Type | Required | Description | |-----------|------|----------|-------------| | `task` | string | Yes | How to match rows between tables. | -| `left_csv` | string | Yes | Absolute path to primary CSV. | -| `right_csv` | string | Yes | Absolute path to secondary CSV. | -| `merge_on_left` | string | No | Column in left table to match on. | -| `merge_on_right` | string | No | Column in right table to match on. | +| `left_csv` | string | Yes | Absolute path to the left CSV — the table being enriched. All its rows are kept in the output. 
| +| `right_csv` | string | Yes | Absolute path to the right CSV — the lookup/reference table. Its columns are appended to matches; unmatched left rows get nulls. | +| `merge_on_left` | string | No | Only set if you expect exact string matches on this column or want to draw agent attention to it. Fine to omit. | +| `merge_on_right` | string | No | Only set if you expect exact string matches on this column or want to draw agent attention to it. Fine to omit. | +| `relationship_type` | string | No | `many_to_one` (default) — multiple left rows can match one right row. `one_to_one` — only when both tables have unique entities of the same kind. | | `use_web_search` | string | No | `auto` (default), `yes`, or `no`. | Returns `task_id` and `session_url`. Call `everyrow_progress` to monitor. diff --git a/docs/reference/MERGE.md b/docs/reference/MERGE.md index 067023ee..1db2a38b 100644 --- a/docs/reference/MERGE.md +++ b/docs/reference/MERGE.md @@ -14,15 +14,17 @@ from everyrow.ops import merge result = await merge( task="Match each software product to its parent company", - left_table=software_products, - right_table=approved_vendors, - merge_on_left="product_name", - merge_on_right="company_name", + left_table=software_products, # table being enriched — all rows kept + right_table=approved_vendors, # lookup/reference table — columns appended + # merge_on_left/merge_on_right omitted: auto-detection handles most cases. + # Only specify them when you expect exact string matches on specific columns + # or want to draw agent attention to them. ) print(result.data.head()) ``` -For ambiguous cases, add context: +For ambiguous cases, add context. 
Here `merge_on_left`/`merge_on_right` are set because +the column names ("sponsor", "company") are too generic for auto-detection: ```python result = await merge( @@ -34,10 +36,10 @@ result = await merge( - Regional names (MSD is Merck outside the US) - Abbreviations (BMS → Bristol-Myers Squibb) """, - left_table=trials, - right_table=pharma_companies, - merge_on_left="sponsor", - merge_on_right="company", + left_table=trials, # table being enriched — all rows kept + right_table=pharma_companies, # lookup table + merge_on_left="sponsor", # specified: draws agent attention to this column + merge_on_right="company", # specified: draws agent attention to this column ) print(result.data.head()) ``` @@ -51,10 +53,12 @@ A DataFrame with all left table columns plus matched right table columns. Rows t | Name | Type | Description | |------|------|-------------| | `task` | str | How to match the tables | -| `left_table` | DataFrame | Primary table (all rows kept) | -| `right_table` | DataFrame | Table to match from | -| `merge_on_left` | Optional[str] | Column in left table. Model will try to guess if not specified. | -| `merge_on_right` | Optional[str] | Column in right table. Model will try to guess if not specified. | +| `left_table` | DataFrame | The table being enriched — all its rows are kept in the output (LEFT JOIN). | +| `right_table` | DataFrame | The lookup/reference table — its columns are appended to matches; unmatched left rows get nulls. | +| `merge_on_left` | Optional[str] | Only set if you expect exact string matches on this column or want to draw agent attention to it. Auto-detected if omitted. | +| `merge_on_right` | Optional[str] | Only set if you expect exact string matches on this column or want to draw agent attention to it. Auto-detected if omitted. | +| `relationship_type` | Optional[str] | `"many_to_one"` (default) — multiple left rows can match one right row. `"one_to_one"` — only when both tables have unique entities of the same kind. 
| +| `use_web_search` | Optional[str] | `"auto"` (default), `"yes"`, or `"no"`. Controls whether agents use web search to resolve matches. | | `session` | Session | Optional, auto-created if omitted | ## Performance diff --git a/everyrow-mcp/README.md b/everyrow-mcp/README.md index ea5da7c2..f0bfc223 100644 --- a/everyrow-mcp/README.md +++ b/everyrow-mcp/README.md @@ -103,20 +103,20 @@ Example: Dedupe contacts where "same person even with name abbreviations or care ### everyrow_merge -Join two CSV files using intelligent entity matching. +Join two CSV files using intelligent entity matching (LEFT JOIN semantics). ``` Parameters: - task: Natural language description of how to match rows -- left_csv: Absolute path to primary CSV -- right_csv: Absolute path to secondary CSV -- merge_on_left: (optional) Column name in left table -- merge_on_right: (optional) Column name in right table -- use_web_search: (optional) "auto", "yes", or "no" -- relationship_type: (optional) "many_to_one" (default) if multiple left rows can match one right row, or "one_to_one" matches must be unique +- left_csv: Absolute path to the left CSV — the table being enriched (ALL its rows are kept in the output) +- right_csv: Absolute path to the right CSV — the lookup/reference table (its columns are appended to matches; unmatched left rows get nulls) +- merge_on_left: (optional) Only set if you expect exact string matches on this column or want to draw agent attention to it. Fine to omit. +- merge_on_right: (optional) Only set if you expect exact string matches on this column or want to draw agent attention to it. Fine to omit. +- use_web_search: (optional) "auto" (default), "yes", or "no" +- relationship_type: (optional) "many_to_one" (default) — multiple left rows can match one right row. "one_to_one" — only when both tables have unique entities of the same kind. 
``` -Example: Match software products to parent companies (Photoshop -> Adobe) +Example: Match software products (left, enriched) to parent companies (right, lookup): Photoshop -> Adobe ### everyrow_agent diff --git a/everyrow-mcp/src/everyrow_mcp/server.py b/everyrow-mcp/src/everyrow_mcp/server.py index 63502e8b..5c10f478 100644 --- a/everyrow-mcp/src/everyrow_mcp/server.py +++ b/everyrow-mcp/src/everyrow_mcp/server.py @@ -212,20 +212,28 @@ class MergeInput(BaseModel): description="Natural language description of how to match rows.", min_length=1, ) - left_csv: str = Field(..., description="Absolute path to the left/primary CSV.") - right_csv: str = Field(..., description="Absolute path to the right/secondary CSV.") + left_csv: str = Field( + ..., + description="Absolute path to the left CSV. Works like a LEFT JOIN: ALL rows from this table are kept in the output. This should be the table being enriched.", + ) + right_csv: str = Field( + ..., + description="Absolute path to the right CSV. This is the lookup/reference table. Its columns are added to matching left rows; unmatched left rows get nulls.", + ) merge_on_left: str | None = Field( - default=None, description="Optional column name in left table for merge key." + default=None, + description="Only set if you expect some exact string matches on the chosen column or want to draw special attention of LLM agents to this particular column. Fine to leave unspecified in all other cases.", ) merge_on_right: str | None = Field( - default=None, description="Optional column name in right table for merge key." + default=None, + description="Only set if you expect some exact string matches on the chosen column or want to draw special attention of LLM agents to this particular column. Fine to leave unspecified in all other cases.", ) use_web_search: Literal["auto", "yes", "no"] | None = Field( default=None, description='Control web search: "auto", "yes", or "no".' 
) relationship_type: Literal["many_to_one", "one_to_one"] | None = Field( default=None, - description='Optional. Control merge relationship type: "many_to_one" (default) allows multiple left rows to match one right row, "one_to_one" enforces unique matching between left and right rows.', + description="Leave unset for the default many_to_one, which is correct in most cases. many_to_one: multiple left rows can match one right row (e.g. products → companies). one_to_one: each left row matches at most one right row AND vice versa. Only use one_to_one when both tables represent unique entities of the same kind.", ) @field_validator("left_csv", "right_csv") @@ -517,10 +525,22 @@ async def everyrow_merge(params: MergeInput) -> list[TextContent]: Merge combines two tables even when keys don't match exactly. The LLM performs research and reasoning to identify which rows should be joined. + left_csv = the table being enriched (ALL its rows appear in the output). + right_csv = the lookup/reference table (its columns are appended to matches). + + IMPORTANT defaults — omit parameters when unsure: + - merge_on_left/merge_on_right: only set if you expect exact string matches on + the chosen columns or want to draw agent attention to them. Fine to omit. + - relationship_type: defaults to many_to_one, which is correct in most cases. + Only set one_to_one when both tables have unique entities of the same kind. + Examples: - - Match software products to parent companies (Photoshop -> Adobe) - - Match clinical trial sponsors to pharma companies (Genentech -> Roche) - - Join contact lists with different name formats + - Match software products (left, enriched) to parent companies (right, lookup): + Photoshop -> Adobe. relationship_type: many_to_one (many products per company). + - Match clinical trial sponsors (left) to pharma companies (right): + Genentech -> Roche. relationship_type: many_to_one. 
+ - Join two contact lists with different name formats: + relationship_type: one_to_one (each person appears once in each list). This function submits the task and returns immediately with a task_id and session_url. After receiving a result from this tool, share the session_url with the user. diff --git a/skills/everyrow-sdk/SKILL.md b/skills/everyrow-sdk/SKILL.md index 91b5a19a..61163688 100644 --- a/skills/everyrow-sdk/SKILL.md +++ b/skills/everyrow-sdk/SKILL.md @@ -116,15 +116,17 @@ Parameters: ``` ### everyrow_merge -Join two CSV files using intelligent entity matching. +Join two CSV files using intelligent entity matching (LEFT JOIN semantics). ``` Parameters: - task: Natural language description of how to match rows -- left_csv: Absolute path to primary CSV -- right_csv: Absolute path to secondary CSV +- left_csv: Absolute path to the left CSV — the table being enriched (ALL its rows are kept in the output) +- right_csv: Absolute path to the right CSV — the lookup/reference table (its columns are appended to matches; unmatched left rows get nulls) - output_path: Directory or full .csv path for output -- merge_on_left: (optional) Column name in left table -- merge_on_right: (optional) Column name in right table +- merge_on_left: (optional) Only set if you expect exact string matches on the chosen column or want to draw agent attention to it. Fine to omit. +- merge_on_right: (optional) Only set if you expect exact string matches on the chosen column or want to draw agent attention to it. Fine to omit. +- relationship_type: (optional) Defaults to "many_to_one", which is correct in most cases (e.g. products → companies). Only set "one_to_one" when both tables have unique entities of the same kind. 
+- use_web_search: (optional) "auto" (default), "yes", or "no" ``` ### everyrow_agent @@ -225,22 +227,22 @@ Parameters: `input`, `equivalence_relation`, `strategy`, `strategy_prompt`, `ses ### merge - Merge tables with AI matching -Join two tables when the keys don't match exactly. The AI knows "Photoshop" belongs to "Adobe" and "Genentech" is a Roche subsidiary: +Join two tables when the keys don't match exactly (LEFT JOIN semantics). The AI knows "Photoshop" belongs to "Adobe" and "Genentech" is a Roche subsidiary: ```python from everyrow.ops import merge result = await merge( task="Match each software product to its parent company", - left_table=software_products, - right_table=approved_suppliers, - merge_on_left="software_name", - merge_on_right="company_name", + left_table=software_products, # table being enriched — all rows kept + right_table=approved_suppliers, # lookup/reference table — columns appended to matches + # merge_on_left/merge_on_right: omit unless you expect exact string matches + # on the chosen columns or want to draw agent attention to them. ) print(result.data.head()) ``` -Parameters: `task`, `left_table`, `right_table`, `merge_on_left`, `merge_on_right`, `session` +Parameters: `task`, `left_table`, `right_table`, `merge_on_left`, `merge_on_right`, `relationship_type`, `use_web_search`, `session` ### screen - Evaluate and filter rows diff --git a/src/everyrow/ops.py b/src/everyrow/ops.py index 097f4325..b58a6eaa 100644 --- a/src/everyrow/ops.py +++ b/src/everyrow/ops.py @@ -582,17 +582,17 @@ async def merge( use_web_search: Literal["auto", "yes", "no"] | None = None, relationship_type: Literal["many_to_one", "one_to_one"] | None = None, ) -> MergeResult: - """Merge two tables using AI. + """Merge two tables using AI (LEFT JOIN semantics). Args: task: The task description for the merge operation session: Optional session. If not provided, one will be created automatically. 
- left_table: The left table to merge (DataFrame, UUID, or TableResult) - right_table: The right table to merge (DataFrame, UUID, or TableResult) - merge_on_left: Optional column name in left table to merge on - merge_on_right: Optional column name in right table to merge on - use_web_search: Optional. Control web search behavior: "auto" tries LLM merge first then conditionally searches, "no" skips web search entirely, "yes" forces web search on every row. Defaults to "auto" if not provided. - relationship_type: Optional. Control merge relationship type: "many_to_one" (default) allows multiple left rows to match one right row, "one_to_one" enforces unique matching between left and right rows. + left_table: The table being enriched — all its rows are kept in the output (DataFrame, UUID, or TableResult) + right_table: The lookup/reference table — its columns are appended to matches; unmatched left rows get nulls (DataFrame, UUID, or TableResult) + merge_on_left: Only set if you expect exact string matches on this column or want to draw agent attention to it. Auto-detected if omitted. + merge_on_right: Only set if you expect exact string matches on this column or want to draw agent attention to it. Auto-detected if omitted. + use_web_search: Control web search behavior: "auto" (default) tries LLM merge first then conditionally searches, "no" skips web search entirely, "yes" forces web search on every row. + relationship_type: Defaults to "many_to_one", which is correct in most cases (multiple left rows can match one right row, e.g. products → companies). Only use "one_to_one" when both tables have unique entities of the same kind. Returns: MergeResult containing the merged table and match breakdown by method (exact, fuzzy, llm, web)