Modules

Adapters for loading and saving data.

Initially we support local CSV files and Google Sheets spreadsheets.

CSVFileDataSource

Bases: AbstractDataSource

Source code in src/sortition_algorithms/adapters.py
class CSVFileDataSource(AbstractDataSource):
    def __init__(
        self,
        *,
        features_file: Path,
        people_file: Path,
        already_selected_file: Path | None,
        selected_file: Path,
        remaining_file: Path,
    ) -> None:
        self.features_file = features_file
        self.people_file = people_file
        self.already_selected_file = already_selected_file
        self.selected_file = selected_file
        self.remaining_file = remaining_file

    @property
    def people_data_container(self) -> str:
        return f" CSV file '{self.people_file}'"

    @property
    def already_selected_data_container(self) -> str:
        return f" CSV file '{self.already_selected_file}'"

    @contextmanager
    def read_feature_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        report.add_message("loading_features_from_file", file_path=str(self.features_file))
        with open(self.features_file, newline="") as csv_file:
            feature_reader = csv.DictReader(csv_file, strict=True)
            assert feature_reader.fieldnames is not None
            yield list(feature_reader.fieldnames), feature_reader

    @contextmanager
    def read_people_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        report.add_message("loading_people_from_file", file_path=str(self.people_file))
        with open(self.people_file, newline="") as csv_file:
            people_reader = csv.DictReader(csv_file, strict=True)
            assert people_reader.fieldnames is not None
            yield list(people_reader.fieldnames), people_reader

    @contextmanager
    def read_already_selected_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        if self.already_selected_file is None or not self.already_selected_file.exists():
            report.add_message("no_already_selected_file")
            yield [], []
            return
        report.add_message("loading_already_selected_from_file", file_path=str(self.already_selected_file))
        with open(self.already_selected_file, newline="") as csv_file:
            already_selected_reader = csv.DictReader(csv_file, strict=True)
            assert already_selected_reader.fieldnames is not None
            yield list(already_selected_reader.fieldnames), already_selected_reader

    def write_selected(self, selected: list[list[str]], report: RunReport) -> None:
        report.add_message_and_log("writing_selected_csv", logging.INFO, file_path=self.selected_file)
        with open(self.selected_file, "w", newline="") as csv_file:
            _write_csv_rows(csv_file, selected)

    def write_remaining(self, remaining: list[list[str]], report: RunReport) -> None:
        report.add_message_and_log("writing_remaining_csv", logging.INFO, file_path=self.remaining_file)
        with open(self.remaining_file, "w", newline="") as csv_file:
            _write_csv_rows(csv_file, remaining)

    def highlight_dupes(self, dupes: list[int]) -> None:
        """Cannot highlight a CSV file"""

    def customise_features_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        return SelectionMultilineError([
            f"Parser error(s) while reading features from {self.features_file}",
            *[str(e) for e in error.all_errors],
        ])

    def customise_people_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        return SelectionMultilineError([
            f"Parser error(s) while reading people from {self.people_file}",
            *[str(e) for e in error.all_errors],
        ])

    def customise_already_selected_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        return SelectionMultilineError([
            f"Parser error(s) while reading already selected people from {self.already_selected_file}",
            *[str(e) for e in error.all_errors],
        ])

highlight_dupes(dupes)

Cannot highlight a CSV file

Source code in src/sortition_algorithms/adapters.py
def highlight_dupes(self, dupes: list[int]) -> None:
    """Cannot highlight a CSV file"""

CSVStringDataSource

Bases: AbstractDataSource

Source code in src/sortition_algorithms/adapters.py
class CSVStringDataSource(AbstractDataSource):
    def __init__(self, features_data: str, people_data: str, already_selected_data: str = "") -> None:
        self.features_data = features_data
        self.people_data = people_data
        self.already_selected_data = already_selected_data
        self.selected_file = StringIO()
        self.remaining_file = StringIO()
        self.selected_file_written = False
        self.remaining_file_written = False

    @property
    def people_data_container(self) -> str:
        return "people CSV data"

    @property
    def already_selected_data_container(self) -> str:
        return "already selected CSV data"

    @contextmanager
    def read_feature_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        report.add_message("loading_features_from_string")
        feature_reader = csv.DictReader(StringIO(self.features_data), strict=True)
        assert feature_reader.fieldnames is not None
        yield list(feature_reader.fieldnames), feature_reader

    @contextmanager
    def read_people_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        report.add_message("loading_people_from_string")
        people_reader = csv.DictReader(StringIO(self.people_data), strict=True)
        assert people_reader.fieldnames is not None
        yield list(people_reader.fieldnames), people_reader

    @contextmanager
    def read_already_selected_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        if not self.already_selected_data or not self.already_selected_data.strip():
            report.add_message("no_already_selected_data")
            yield [], []
            return
        report.add_message("loading_already_selected_from_string")
        already_selected_reader = csv.DictReader(StringIO(self.already_selected_data), strict=True)
        assert already_selected_reader.fieldnames is not None
        yield list(already_selected_reader.fieldnames), already_selected_reader

    def write_selected(self, selected: list[list[str]], report: RunReport) -> None:
        _write_csv_rows(self.selected_file, selected)
        self.selected_file_written = True

    def write_remaining(self, remaining: list[list[str]], report: RunReport) -> None:
        _write_csv_rows(self.remaining_file, remaining)
        self.remaining_file_written = True

    def highlight_dupes(self, dupes: list[int]) -> None:
        """Cannot highlight a CSV file"""

    def customise_features_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        # given the info is in strings, we can't usefully add anything
        return error

    def customise_people_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        # given the info is in strings, we can't usefully add anything
        return error

    def customise_already_selected_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        # given the info is in strings, we can't usefully add anything
        return error

highlight_dupes(dupes)

Cannot highlight a CSV file

Source code in src/sortition_algorithms/adapters.py
def highlight_dupes(self, dupes: list[int]) -> None:
    """Cannot highlight a CSV file"""

GSheetDataSource

Bases: AbstractDataSource

Source code in src/sortition_algorithms/adapters.py
class GSheetDataSource(AbstractDataSource):
    scope: ClassVar = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/drive",
    ]
    hl_light_blue: ClassVar = {
        "backgroundColor": {
            "red": 153 / 255,
            "green": 204 / 255,
            "blue": 255 / 255,
        }
    }
    hl_orange: ClassVar = {"backgroundColor": {"red": 5, "green": 2.5, "blue": 0}}

    def __init__(
        self, *, feature_tab_name: str, people_tab_name: str, already_selected_tab_name: str = "", auth_json_path: Path
    ) -> None:
        self.feature_tab_name = feature_tab_name
        self.people_tab_name = people_tab_name
        self.already_selected_tab_name = already_selected_tab_name
        self.auth_json_path = auth_json_path
        self._client: gspread.client.Client | None = None
        self._spreadsheet: gspread.Spreadsheet | None = None
        self.new_tab_default_size_rows = 2
        self.new_tab_default_size_cols = 40
        self._g_sheet_name = ""
        self._open_g_sheet_name = ""
        self.selected_tab_name = ""
        self.remaining_tab_name = ""
        self.tab_namer = GSheetTabNamer()
        self._report = RunReport()

    @property
    def people_data_container(self) -> str:
        return f"'{self.people_tab_name}' tab"

    @property
    def already_selected_data_container(self) -> str:
        return f"'{self.already_selected_tab_name}' tab"

    @property
    def client(self) -> gspread.client.Client:
        if self._client is None:
            creds = ServiceAccountCredentials.from_json_keyfile_name(str(self.auth_json_path), self.scope)
            # if we're getting rate limited, go slower!
            # by using the BackOffHTTPClient, that will sleep and retry
            # if it gets an error related to API usage rate limits.
            self._client = gspread.authorize(creds, http_client=gspread.BackOffHTTPClient)
        return self._client

    @property
    def spreadsheet(self) -> gspread.Spreadsheet:
        if self._open_g_sheet_name != self._g_sheet_name:
            # reset the spreadsheet if the name changed
            self._spreadsheet = None
            self.tab_namer.reset()
        if self._spreadsheet is None:
            if self._g_sheet_name.startswith("https://"):
                self._spreadsheet = self.client.open_by_url(self._g_sheet_name)
            else:
                self._spreadsheet = self.client.open(self._g_sheet_name)
            self._open_g_sheet_name = self._g_sheet_name
            self._report.add_message_and_log("opened_gsheet", logging.INFO, title=self._spreadsheet.title)
        return self._spreadsheet

    def _get_tab(self, tab_name: str) -> gspread.Worksheet | None:
        if not self._g_sheet_name:
            return None
        tab_list = self.spreadsheet.worksheets()
        try:
            return next(tab for tab in tab_list if tab.title == tab_name)
        except StopIteration:
            return None

    def _tab_exists(self, tab_name: str) -> bool:
        return bool(self._get_tab(tab_name))

    def _get_tab_titles(self) -> list[str]:
        if not self._g_sheet_name:
            return []
        return [tab.title for tab in self.spreadsheet.worksheets()]

    def _create_tab(self, tab_name: str) -> gspread.Worksheet:
        return self.spreadsheet.add_worksheet(
            title=tab_name,
            rows=self.new_tab_default_size_rows,
            cols=self.new_tab_default_size_cols,
        )

    def set_g_sheet_name(self, g_sheet_name: str) -> None:
        # if we're changing spreadsheet, reset the spreadsheet object
        if self._g_sheet_name != g_sheet_name:
            self._spreadsheet = None
            self._g_sheet_name = g_sheet_name
            self.tab_namer.reset()

    def get_title(self) -> str:
        try:
            return self.spreadsheet.title
        except gspread.SpreadsheetNotFound as err:
            msg = f"Google spreadsheet not found: {self._g_sheet_name}."
            raise SelectionError(
                message=msg, error_code="spreadsheet_not_found", error_params={"spreadsheet_name": self._g_sheet_name}
            ) from err

    @contextmanager
    def read_feature_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        self._report = report
        try:
            if not self._tab_exists(self.feature_tab_name):
                msg = (
                    f"Error in Google sheet: no tab called '{self.feature_tab_name}' "
                    f"found in spreadsheet '{self.spreadsheet.title}'."
                )
                raise SelectionError(
                    message=msg,
                    error_code="tab_not_found",
                    error_params={"tab_name": self.feature_tab_name, "spreadsheet_title": self.spreadsheet.title},
                )
        except gspread.SpreadsheetNotFound as err:
            msg = f"Google spreadsheet not found: {self._g_sheet_name}."
            raise SelectionError(
                message=msg, error_code="spreadsheet_not_found", error_params={"spreadsheet_name": self._g_sheet_name}
            ) from err
        tab_features = self.spreadsheet.worksheet(self.feature_tab_name)
        feature_head = tab_features.row_values(1)
        feature_body = _stringify_records(tab_features.get_all_records(expected_headers=[]))
        yield feature_head, feature_body

    @contextmanager
    def read_people_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        self._report = report
        try:
            if not self._tab_exists(self.people_tab_name):
                msg = (
                    f"Error in Google sheet: no tab called '{self.people_tab_name}' "
                    f"found in spreadsheet '{self.spreadsheet.title}'."
                )
                raise SelectionError(
                    message=msg,
                    error_code="tab_not_found",
                    error_params={"tab_name": self.people_tab_name, "spreadsheet_title": self.spreadsheet.title},
                )
        except gspread.SpreadsheetNotFound as err:
            msg = f"Google spreadsheet not found: {self._g_sheet_name}. "
            raise SelectionError(
                message=msg, error_code="spreadsheet_not_found", error_params={"spreadsheet_name": self._g_sheet_name}
            ) from err

        tab_people = self.spreadsheet.worksheet(self.people_tab_name)
        # if we don't read this in here we can't check if there are 2 columns with the same name
        people_head = tab_people.row_values(1)
        # the numericise_ignore doesn't convert the phone numbers to ints...
        # 1 Oct 2024: the final argument with expected_headers is to deal with the fact that
        # updated versions of gspread can't cope with duplicate headers
        people_body = _stringify_records(
            tab_people.get_all_records(
                numericise_ignore=["all"],
                expected_headers=[],
            )
        )
        self._report.add_message("reading_gsheet_tab", tab_name=self.people_tab_name)
        yield people_head, people_body

    def _find_header_row(
        self, all_values: gspread.ValueRange | list[list[Any]], min_headers: int = 3
    ) -> tuple[int, list[str]]:
        """
        Find the first row that looks like a header row in the worksheet.

        A row is considered a header row if it has at least min_headers non-empty cells.
        This helps skip over title rows, empty rows, etc.

        Args:
            worksheet: The worksheet to search
            min_headers: Minimum number of non-empty cells to consider a row as headers (default: 3)

        Returns:
            Tuple of (row_number, header_values) where row_number is 1-indexed
        """
        # Find the first row with enough non-empty cells to be a header row
        for row_idx, row in enumerate(all_values, start=1):
            # Count non-empty cells in this row
            non_empty_cells = [cell.strip() for cell in row if cell and cell.strip()]
            if len(non_empty_cells) >= min_headers:
                return row_idx, list(row)

        # If no suitable header row found, return empty
        return 1, []

    @contextmanager
    def read_already_selected_data(
        self, report: RunReport
    ) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
        self._report = report
        if not self.already_selected_tab_name:
            # If no tab name provided, return empty data
            self._report.add_message("no_already_selected_tab")
            yield [], []
            return

        try:
            if not self._tab_exists(self.already_selected_tab_name):
                msg = (
                    f"Error in Google sheet: no tab called '{self.already_selected_tab_name}' "
                    f"found in spreadsheet '{self.spreadsheet.title}'."
                )
                raise SelectionError(
                    message=msg,
                    error_code="tab_not_found",
                    error_params={
                        "tab_name": self.already_selected_tab_name,
                        "spreadsheet_title": self.spreadsheet.title,
                    },
                )
        except gspread.SpreadsheetNotFound as err:
            msg = f"Google spreadsheet not found: {self._g_sheet_name}. "
            raise SelectionError(
                message=msg, error_code="spreadsheet_not_found", error_params={"spreadsheet_name": self._g_sheet_name}
            ) from err

        tab_already_selected = self.spreadsheet.worksheet(self.already_selected_tab_name)

        # Get all values from the sheet
        all_values = tab_already_selected.get_all_values(value_render_option=ValueRenderOption.unformatted)

        # Find which row contains the headers
        header_row_num, already_selected_head = self._find_header_row(all_values)

        # If no header row found or it's empty, return empty data
        if not already_selected_head or not any(cell.strip() for cell in already_selected_head):
            self._report.add_line(
                f"Tab '{self.already_selected_tab_name}' is empty or has no valid header row, using empty data."
            )
            yield [], []
            return

        # the numericise_ignore doesn't convert the phone numbers to ints...
        # 1 Oct 2024: the final argument with expected_headers is to deal with the fact that
        # updated versions of gspread can't cope with duplicate headers
        already_selected_body = _stringify_records(
            tab_already_selected.get_all_records(
                head=header_row_num,
                numericise_ignore=["all"],
                expected_headers=[],
            )
        )
        self._report.add_message(
            "reading_already_selected_tab", tab_name=self.already_selected_tab_name, header_row=header_row_num
        )
        yield already_selected_head, already_selected_body

    def write_selected(self, selected: list[list[str]], report: RunReport) -> None:
        self.tab_namer.find_unused_tab_suffix(self._get_tab_titles())
        tab_selected = self._create_tab(self.tab_namer.selected_tab_name())
        report.add_message_and_log("writing_selected_tab", logging.INFO, tab_name=tab_selected.title)
        self.selected_tab_name = tab_selected.title
        tab_selected.update(selected)
        tab_selected.format("A1:U1", self.hl_light_blue)
        user_logger.info(f"Selected people written to {tab_selected.title} tab")

    def write_remaining(self, remaining: list[list[str]], report: RunReport) -> None:
        # the number is selected during write_selected(), so we reuse it here
        tab_remaining = self._create_tab(self.tab_namer.remaining_tab_name())
        report.add_message_and_log("writing_remaining_tab", logging.INFO, tab_name=tab_remaining.title)
        self.remaining_tab_name = tab_remaining.title
        tab_remaining.update(remaining)
        tab_remaining.format("A1:U1", self.hl_light_blue)

    def highlight_dupes(self, dupes: list[int]) -> None:
        if not dupes:
            return
        tab_remaining = self._get_tab(self.tab_namer.remaining_tab_name())
        assert tab_remaining is not None, "highlight_dupes() has been called without first calling write_remaining()"
        # note that the indexes we have produced start at 0, but the row indexes start at 1
        # so we need to add 1 to the indexes.
        row_strings = [f"A{index + 1}:U{index + 1}" for index in dupes]
        tab_remaining.format(row_strings, self.hl_orange)

    def delete_old_output_tabs(self, dry_run: bool = False) -> list[str]:
        """
        Find and delete all tabs with names starting with the tab stubs for selected or remaining

        Args:
            dry_run: If True, report what would be deleted without actually deleting.

        Returns:
            List of tab names that were deleted (or would be deleted in dry_run mode).
        """
        if not self._g_sheet_name:
            return []

        all_tabs = self.spreadsheet.worksheets()
        tabs_to_delete: list[gspread.Worksheet] = []

        for tab in all_tabs:
            if self.tab_namer.matches_stubs(tab.title):
                tabs_to_delete.append(tab)

        deleted_names: list[str] = []
        for tab in tabs_to_delete:
            deleted_names.append(tab.title)
            if not dry_run:
                self.spreadsheet.del_worksheet(tab)

        return deleted_names

    def _annotate_parse_errors_with_cell_names(self, error: ParseTableMultiError, headers: Sequence[str]) -> list[str]:
        msgs: list[str] = []
        for sub_error in error.all_errors:
            if isinstance(sub_error, ParseTableErrorMsg):
                cell_name = get_cell_name(sub_error.row, sub_error.key, headers)
                msgs.append(f"{sub_error.msg} - see cell {cell_name}")
            else:
                cell_names = [get_cell_name(sub_error.row, key, headers) for key in sub_error.keys]
                msgs.append(f"{sub_error.msg} - see cells {' '.join(cell_names)}")
        return msgs

    def customise_features_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        return SelectionMultilineError([
            f"Parser error(s) while reading features from '{self.feature_tab_name}' worksheet",
            *self._annotate_parse_errors_with_cell_names(error, headers),
        ])

    def customise_people_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        return SelectionMultilineError([
            f"Parser error(s) while reading people from '{self.people_tab_name}' worksheet",
            *self._annotate_parse_errors_with_cell_names(error, headers),
        ])

    def customise_already_selected_parse_error(
        self, error: ParseTableMultiError, headers: Sequence[str]
    ) -> SelectionMultilineError:
        return SelectionMultilineError([
            f"Parser error(s) while reading people from '{self.already_selected_tab_name}' worksheet",
            *self._annotate_parse_errors_with_cell_names(error, headers),
        ])

delete_old_output_tabs(dry_run=False)

Find and delete all tabs with names starting with the tab stubs for selected or remaining

Parameters:

    dry_run (bool): If True, report what would be deleted without actually deleting. Default: False

Returns:

    list[str]: List of tab names that were deleted (or would be deleted in dry_run mode).

Source code in src/sortition_algorithms/adapters.py
def delete_old_output_tabs(self, dry_run: bool = False) -> list[str]:
    """
    Find and delete all tabs with names starting with the tab stubs for selected or remaining

    Args:
        dry_run: If True, report what would be deleted without actually deleting.

    Returns:
        List of tab names that were deleted (or would be deleted in dry_run mode).
    """
    if not self._g_sheet_name:
        return []

    all_tabs = self.spreadsheet.worksheets()
    tabs_to_delete: list[gspread.Worksheet] = []

    for tab in all_tabs:
        if self.tab_namer.matches_stubs(tab.title):
            tabs_to_delete.append(tab)

    deleted_names: list[str] = []
    for tab in tabs_to_delete:
        deleted_names.append(tab.title)
        if not dry_run:
            self.spreadsheet.del_worksheet(tab)

    return deleted_names
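
A construction sketch (tab names and file paths are hypothetical; the constructor arguments are keyword-only):

from pathlib import Path

from sortition_algorithms.adapters import GSheetDataSource

source = GSheetDataSource(
    feature_tab_name="Categories",
    people_tab_name="Respondents",
    auth_json_path=Path("service-account.json"),
)
# Accepts either a spreadsheet title or a full https:// URL:
source.set_g_sheet_name("My Selection Spreadsheet")
# Preview which old output tabs would be removed, without deleting anything:
would_delete = source.delete_old_output_tabs(dry_run=True)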

generate_dupes(people_remaining_rows, people_selected_rows, settings, already_selected=None)

Generate a list of indexes of people who share an address with someone else in this set of rows.

Note that the first row of people_remaining_rows is the column headers. The indexes generated are for the rows in this table, so the index takes account of the first row being the header.

So if we had people_remaining_rows:

id,name,address_line_1,postcode
1,Alice,33 Acacia Avenue,W1A 1AA
1,Bob,31 Acacia Avenue,W1A 1AA
1,Charlotte,33 Acacia Avenue,W1A 1AA
1,David,33 Acacia Avenue,W1B 1BB

And settings with check_same_address_columns = ["address_line_1", "postcode"]

Then we should return [1, 3]

Source code in src/sortition_algorithms/adapters.py
def generate_dupes(  # noqa: C901
    people_remaining_rows: list[list[str]],
    people_selected_rows: list[list[str]],
    settings: Settings,
    already_selected: People | None = None,
) -> list[int]:
    """
    Generate a list of indexes of people who share an address with someone else in this set of rows.

    Note that the first row of people_remaining_rows is the column headers.  The indexes generated
    are for the rows in this table, so the index takes account of the first row being the header.

    So if we had people_remaining_rows:

    id,name,address_line_1,postcode
    1,Alice,33 Acacia Avenue,W1A 1AA
    1,Bob,31 Acacia Avenue,W1A 1AA
    1,Charlotte,33 Acacia Avenue,W1A 1AA
    1,David,33 Acacia Avenue,W1B 1BB

    And settings with `check_same_address_columns = ["address_line_1", "postcode"]`

    Then we should return [1, 3]
    """
    if not settings.check_same_address:
        return []

    table_col_names = people_remaining_rows[0]
    address_col_indexes: list[int] = [
        index for index, col in enumerate(table_col_names) if col in settings.check_same_address_columns
    ]

    def _address_from_row(person: list[str]) -> tuple[str, ...]:
        return tuple(col.lower() for col_index, col in enumerate(person) if col_index in address_col_indexes)

    # first, we assemble a dict with the key being the address, the value being the list of
    # indexes of people at that address
    address_remaining_index: dict[tuple[str, ...], list[int]] = defaultdict(list)
    for person_index, person in enumerate(people_remaining_rows[1:], start=1):  # skip the header row
        address_remaining_index[_address_from_row(person)].append(person_index)

    # now extract all those people where the number of people at their address is more than one
    dupes: set[int] = set()
    for persons_at_address in address_remaining_index.values():
        if len(persons_at_address) > 1:
            dupes.update(persons_at_address)

    # Now we assemble the list of all selected addresses.
    already_selected_addresses: set[tuple[str, ...]] = set()
    # First those selected in this round of selection
    for person in people_selected_rows[1:]:  # skip the header row
        already_selected_addresses.add(_address_from_row(person))
    # Then those previously selected (if supplied)
    if already_selected:
        for selected_key in already_selected:
            already_selected_addresses.add(
                already_selected.get_address(selected_key, settings.check_same_address_columns)
            )

    # and check if any of the remaining people has an address matching
    # those selected in this round or previous rounds
    for person_index, person in enumerate(people_remaining_rows[1:], start=1):  # skip the header row
        if _address_from_row(person) in already_selected_addresses:
            dupes.add(person_index)

    return sorted(dupes)
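
A worked example using the rows from the docstring. The real Settings class is not constructed here; a stand-in object carrying only the two attributes generate_dupes reads (check_same_address and check_same_address_columns) is enough for a sketch:

from types import SimpleNamespace

from sortition_algorithms.adapters import generate_dupes

rows = [
    ["id", "name", "address_line_1", "postcode"],
    ["1", "Alice", "33 Acacia Avenue", "W1A 1AA"],
    ["1", "Bob", "31 Acacia Avenue", "W1A 1AA"],
    ["1", "Charlotte", "33 Acacia Avenue", "W1A 1AA"],
    ["1", "David", "33 Acacia Avenue", "W1B 1BB"],
]
settings = SimpleNamespace(
    check_same_address=True,
    check_same_address_columns=["address_line_1", "postcode"],
)
# No one selected yet, so only the header row is passed for the selected table.
# Alice and Charlotte share an address, so their row indexes are returned:
assert generate_dupes(rows, [rows[0]], settings) == [1, 3]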

find_any_committee(features, people, number_people_wanted, check_same_address_columns)

Find any single feasible committee that satisfies the quotas.

Parameters:

    features (FeatureCollection): FeatureCollection with min/max quotas (required)
    people (People): People object with pool members (required)
    number_people_wanted (int): desired size of the panel (required)
    check_same_address_columns (list[str]): columns to check for same address, or empty list if not checking addresses (required)

Returns:

    tuple[list[frozenset[str]], RunReport]: tuple of (list containing one committee as frozenset of person_ids, empty report)

Raises:

    InfeasibleQuotasError: If quotas are infeasible
    SelectionError: If solver fails for other reasons

Source code in src/sortition_algorithms/committee_generation/__init__.py
def find_any_committee(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
) -> tuple[list[frozenset[str]], RunReport]:
    """Find any single feasible committee that satisfies the quotas.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: columns to check for same address, or empty list if
                                    not checking addresses.

    Returns:
        tuple of (list containing one committee as frozenset of person_ids, empty report)

    Raises:
        InfeasibleQuotasError: If quotas are infeasible
        SelectionError: If solver fails for other reasons
    """
    _, agent_vars = setup_committee_generation(features, people, number_people_wanted, check_same_address_columns)
    committee = ilp_results_to_committee(agent_vars)
    return [committee], RunReport()

find_distribution_leximin(features, people, number_people_wanted, check_same_address_columns)

Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected (just like maximin), but breaks ties to maximize the second-lowest probability, breaks further ties to maximize the third-lowest probability and so forth.

Parameters:

    features (FeatureCollection): FeatureCollection with min/max quotas (required)
    people (People): People object with pool members (required)
    number_people_wanted (int): desired size of the panel (required)
    check_same_address_columns (list[str]): Address columns for household identification, or empty if no address checking to be done (required)

Returns:

    tuple[list[frozenset[str]], list[float], RunReport]: tuple of (committees, probabilities, report)
    - committees: list of feasible committees (frozenset of agent IDs)
    - probabilities: list of probabilities for each committee
    - report: run report with log messages

Raises:

    RuntimeError: If Gurobi is not available

Source code in src/sortition_algorithms/committee_generation/leximin.py
def find_distribution_leximin(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
) -> tuple[list[frozenset[str]], list[float], RunReport]:
    """Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected
    (just like maximin), but breaks ties to maximize the second-lowest probability, breaks further ties to maximize the
    third-lowest probability and so forth.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        tuple of (committees, probabilities, report)
        - committees: list of feasible committees (frozenset of agent IDs)
        - probabilities: list of probabilities for each committee
        - report: RunReport with log messages

    Raises:
        RuntimeError: If Gurobi is not available
    """
    if not GUROBI_AVAILABLE:
        msg = "Leximin algorithm requires Gurobi solver which is not available"
        raise RuntimeError(msg, "gurobi_not_available", {})

    report = RunReport()
    report.add_message_and_log("using_leximin_algorithm", logging.INFO)
    grb.setParam("OutputFlag", 0)

    # Set up an ILP that can be used for discovering new feasible committees
    new_committee_model, agent_vars = setup_committee_generation(
        features, people, number_people_wanted, check_same_address_columns
    )

    # Find initial committees that cover every possible agent
    committees, covered_agents, initial_report = generate_initial_committees(
        new_committee_model, agent_vars, 3 * people.count
    )
    report.add_report(initial_report)

    # Run the main leximin optimization loop to fix agent probabilities
    fixed_probabilities = _run_leximin_main_loop(new_committee_model, agent_vars, committees, people, report)

    # Convert fixed agent probabilities to committee probabilities
    probabilities_normalised = _solve_leximin_primal_for_final_probabilities(committees, fixed_probabilities)

    return list(committees), probabilities_normalised, report

find_distribution_maximin(features, people, number_people_wanted, check_same_address_columns)

Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected.

Parameters:

    features (FeatureCollection): FeatureCollection with min/max quotas (required)
    people (People): People object with pool members (required)
    number_people_wanted (int): desired size of the panel (required)
    check_same_address_columns (list[str]): Address columns for household identification, or empty if no address checking to be done (required)

Returns:

    tuple[list[frozenset[str]], list[float], RunReport]: tuple of (committees, probabilities, report)
    - committees: list of feasible committees (frozenset of agent IDs)
    - probabilities: list of probabilities for each committee
    - report: run report with log messages

Source code in src/sortition_algorithms/committee_generation/maximin.py
def find_distribution_maximin(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
) -> tuple[list[frozenset[str]], list[float], RunReport]:
    """Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        tuple of (committees, probabilities, report)
        - committees: list of feasible committees (frozenset of agent IDs)
        - probabilities: list of probabilities for each committee
        - report: RunReport with log messages
    """
    report = RunReport()
    report.add_message_and_log("using_maximin_algorithm", logging.INFO)

    # Set up an ILP that can be used for discovering new feasible committees
    new_committee_model, agent_vars = setup_committee_generation(
        features, people, number_people_wanted, check_same_address_columns
    )

    # Find initial committees that cover every possible agent
    committees, covered_agents, init_report = generate_initial_committees(new_committee_model, agent_vars, people.count)
    report.add_report(init_report)

    # Set up the incremental LP model for column generation
    incremental_model, incr_agent_vars, upper_bound_var = _setup_maximin_incremental_model(committees, covered_agents)

    # Run the main optimization loop
    return _run_maximin_optimization_loop(
        new_committee_model,
        agent_vars,
        incremental_model,
        incr_agent_vars,
        upper_bound_var,
        committees,
        covered_agents,
        report,
    )
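
The returned distribution is over whole committees; to run an actual lottery you draw one committee with probability proportional to its weight. A small sketch using only the standard library (this helper is illustrative, not part of the package):

import random

def draw_panel(committees: list[frozenset[str]], probabilities: list[float]) -> frozenset[str]:
    """Draw one committee according to the computed distribution."""
    return random.choices(committees, weights=probabilities, k=1)[0]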

find_distribution_nash(features, people, number_people_wanted, check_same_address_columns)

Find a distribution over feasible committees that maximizes the Nash welfare, i.e., the product of selection probabilities over all persons.

Parameters:

    features (FeatureCollection): FeatureCollection with min/max quotas (required)
    people (People): People object with pool members (required)
    number_people_wanted (int): desired size of the panel (required)
    check_same_address_columns (list[str]): Address columns for household identification, or empty if no address checking to be done (required)

Returns:

    tuple[list[frozenset[str]], list[float], RunReport]: tuple of (committees, probabilities, report)
    - committees: list of feasible committees (frozenset of agent IDs)
    - probabilities: list of probabilities for each committee
    - report: run report with log messages

The algorithm maximizes the product of selection probabilities Πᵢ pᵢ by equivalently maximizing log(Πᵢ pᵢ) = Σᵢ log(pᵢ). If some person i is not included in any feasible committee, their pᵢ is 0, and this sum is -∞. We maximize Σᵢ log(pᵢ) where i is restricted to range over persons that can possibly be included.

Source code in src/sortition_algorithms/committee_generation/nash.py
def find_distribution_nash(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
) -> tuple[list[frozenset[str]], list[float], RunReport]:
    """Find a distribution over feasible committees that maximizes the Nash welfare, i.e., the product of
    selection probabilities over all persons.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        tuple of (committees, probabilities, report)
        - committees: list of feasible committees (frozenset of agent IDs)
        - probabilities: list of probabilities for each committee
        - report: RunReport with log messages

    The algorithm maximizes the product of selection probabilities Πᵢ pᵢ by equivalently maximizing
    log(Πᵢ pᵢ) = Σᵢ log(pᵢ). If some person i is not included in any feasible committee, their pᵢ is 0, and
    this sum is -∞. We maximize Σᵢ log(pᵢ) where i is restricted to range over persons that can possibly be included.
    """
    report = RunReport()
    report.add_message_and_log("using_nash_algorithm", logging.INFO)

    # Set up an ILP used for discovering new feasible committees
    new_committee_model, agent_vars = setup_committee_generation(
        features, people, number_people_wanted, check_same_address_columns
    )

    # Find initial committees that include every possible agent
    committee_set, covered_agents, initial_report = generate_initial_committees(
        new_committee_model, agent_vars, 2 * people.count
    )
    committees = list(committee_set)
    report.add_report(initial_report)

    # Map the covered agents to indices in a list for easier matrix representation
    entitlements, contributes_to_entitlement = _define_entitlements(covered_agents)

    # Run the main Nash welfare optimization loop
    return _run_nash_optimization_loop(
        new_committee_model,
        agent_vars,
        committees,
        entitlements,
        contributes_to_entitlement,
        covered_agents,
        number_people_wanted,
        report,
    )
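
Each person's overall selection probability is the sum of the probabilities of the committees containing them. A sketch of that aggregation (an illustrative helper, not part of the package):

from collections import defaultdict

def selection_probabilities(
    committees: list[frozenset[str]],
    probabilities: list[float],
) -> dict[str, float]:
    """Aggregate committee probabilities into per-person selection probabilities."""
    per_person: dict[str, float] = defaultdict(float)
    for committee, prob in zip(committees, probabilities, strict=True):
        for person_id in committee:
            per_person[person_id] += prob
    return dict(per_person)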

find_random_sample_legacy(people, features, number_people_wanted, check_same_address_columns=None)

Legacy stratified random selection algorithm.

Implements the original algorithm that uses greedy selection based on priority ratios. Always selects from the most urgently needed category first (highest ratio of (min-selected)/remaining), then randomly picks within that category.

Parameters:

    people (People): People collection (required)
    features (FeatureCollection): Feature definitions with min/max targets (required)
    number_people_wanted (int): Number of people to select (required)
    check_same_address_columns (list[str] | None): Address columns for household identification, or empty if no address checking to be done. Default: None

Returns:

    tuple[list[frozenset[str]], RunReport]: Tuple of (selected_committees, report) where:
    - selected_committees: List containing one frozenset of selected person IDs
    - report: report containing log messages about the selection process

Raises:

    SelectionError: If selection becomes impossible (not enough people, etc.)

Source code in src/sortition_algorithms/committee_generation/legacy.py
def find_random_sample_legacy(
    people: People,
    features: FeatureCollection,
    number_people_wanted: int,
    check_same_address_columns: list[str] | None = None,
) -> tuple[list[frozenset[str]], RunReport]:
    """
    Legacy stratified random selection algorithm.

    Implements the original algorithm that uses greedy selection based on priority ratios.
    Always selects from the most urgently needed category first (highest ratio of
    (min-selected)/remaining), then randomly picks within that category.

    Args:
        people: People collection
        features: Feature definitions with min/max targets
        number_people_wanted: Number of people to select
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        Tuple of (selected_committees, report) where:
        - selected_committees: List containing one frozenset of selected person IDs
        - report: report containing log messages about the selection process

    Raises:
        SelectionError: If selection becomes impossible (not enough people, etc.)
    """
    report = RunReport()
    report.add_message("using_legacy_algorithm")
    people_selected: set[str] = set()

    # Create PeopleFeatures and initialize
    people_features = PeopleFeatures(people, features, check_same_address_columns or [])
    people_features.update_all_features_remaining()
    people_features.prune_for_feature_max_0()

    # Main selection loop
    for count in range(number_people_wanted):
        # Find the category with highest priority ratio
        try:
            ratio_result = people_features.find_max_ratio_category()
        except errors.SelectionError as e:
            msg = f"Selection failed on iteration {count + 1}: {e}"
            raise errors.RetryableSelectionError(msg) from e

        # Find the randomly selected person within that category
        target_feature = ratio_result.feature_name
        target_value = ratio_result.feature_value
        random_position = ratio_result.random_person_index

        selected_person_key = people_features.people.find_person_by_position_in_category(
            target_feature, target_value, random_position
        )

        # Should never select the same person twice
        assert selected_person_key not in people_selected, f"Person {selected_person_key} was already selected"

        # Select the person (this also removes household members if configured)
        people_selected.add(selected_person_key)
        selected_person_data = people_features.people.get_person_dict(selected_person_key)
        household_members_removed = people_features.select_person(selected_person_key)

        # Add output messages about household member removal
        if household_members_removed:
            report.add_line(
                f"Selected {selected_person_key}, also removed household members: "
                f"{', '.join(household_members_removed)}"
            )

        # Handle any categories that are now full after this selection
        try:
            category_report = people_features.handle_category_full_deletions(selected_person_data)
            report.add_report(category_report)
        except errors.SelectionError as e:
            msg = f"Selection failed after selecting {selected_person_key}: {e}"
            raise errors.RetryableSelectionError(msg) from e

        # Check if we're about to run out of people (but not on the last iteration)
        if count < (number_people_wanted - 1) and people_features.people.count == 0:
            msg = "Selection failed: Ran out of people before completing selection"
            raise errors.RetryableSelectionError(msg)

    # Return in legacy format: list containing single frozenset
    return [frozenset(people_selected)], report
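
The "priority ratio" driving the greedy choice can be illustrated with a toy computation (this helper is purely illustrative and not part of the package):

def priority_ratio(min_quota: int, selected: int, remaining: int) -> float:
    """(min - selected) / remaining: higher means the category is more urgent."""
    return (min_quota - selected) / remaining

# A category still needing 3 people with only 4 candidates left is more urgent
# than one needing 2 people with 10 candidates left:
assert priority_ratio(4, 1, 4) > priority_ratio(3, 1, 10)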

standardize_distribution(committees, probabilities)

Remove committees with zero probability and renormalize.

Parameters:

    committees (list[frozenset[str]]): list of committees (required)
    probabilities (list[float]): corresponding probabilities (required)

Returns:

    tuple[list[frozenset[str]], list[float]]: tuple of (filtered_committees, normalized_probabilities)

Source code in src/sortition_algorithms/committee_generation/__init__.py
def standardize_distribution(
    committees: list[frozenset[str]],
    probabilities: list[float],
) -> tuple[list[frozenset[str]], list[float]]:
    """Remove committees with zero probability and renormalize.

    Args:
        committees: list of committees
        probabilities: corresponding probabilities

    Returns:
        tuple of (filtered_committees, normalized_probabilities)
    """
    assert len(committees) == len(probabilities)
    new_committees = []
    new_probabilities = []
    for committee, prob in zip(committees, probabilities, strict=False):
        if prob >= EPS2:
            new_committees.append(committee)
            new_probabilities.append(prob)
    prob_sum = sum(new_probabilities)
    new_probabilities = [prob / prob_sum for prob in new_probabilities]
    return new_committees, new_probabilities
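
A worked example, assuming EPS2 is a small positive threshold (well below the probabilities used here):

from sortition_algorithms.committee_generation import standardize_distribution

committees = [frozenset({"a", "b"}), frozenset({"c", "d"}), frozenset({"e", "f"})]
probabilities = [0.5, 0.0, 0.25]
kept, normalised = standardize_distribution(committees, probabilities)
# The zero-probability committee is dropped and the rest renormalised:
# kept == [frozenset({"a", "b"}), frozenset({"e", "f"})]
# normalised == [2/3, 1/3]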

generate_initial_committees(new_committee_model, agent_vars, multiplicative_weights_rounds)

To speed up the main iteration of the maximin and Nash algorithms, start from a diverse set of feasible committees. In particular, each agent that can be included in any committee will be included in at least one of these committees.

Parameters:

    new_committee_model (Model): MIP model for finding committees (required)
    agent_vars (dict[str, Var]): dict mapping agent_id to binary MIP variables (required)
    multiplicative_weights_rounds (int): number of rounds for the multiplicative weights phase (required)

Returns:

    tuple[set[frozenset[str]], frozenset[str], RunReport]: tuple of (committees, covered_agents, report)
    - committees: set of feasible committees discovered
    - covered_agents: frozenset of all agents included in some committee
    - report: run report with debug messages

Source code in src/sortition_algorithms/committee_generation/common.py
def generate_initial_committees(
    new_committee_model: mip.model.Model,
    agent_vars: dict[str, mip.entities.Var],
    multiplicative_weights_rounds: int,
) -> tuple[set[frozenset[str]], frozenset[str], RunReport]:
    """To speed up the main iteration of the maximin and Nash algorithms, start from a diverse set of feasible
    committees. In particular, each agent that can be included in any committee will be included in at least one of
    these committees.

    Args:
        new_committee_model: MIP model for finding committees
        agent_vars: dict mapping agent_id to binary MIP variables
        multiplicative_weights_rounds: number of rounds for the multiplicative weights phase

    Returns:
        tuple of (committees, covered_agents, report)
        - committees: set of feasible committees discovered
        - covered_agents: frozenset of all agents included in some committee
        - report: run report with debug messages
    """
    report = RunReport()

    # Phase 1: Use multiplicative weights algorithm to find diverse committees
    committees, covered_agents = _run_multiplicative_weights_phase(
        new_committee_model, agent_vars, multiplicative_weights_rounds
    )

    # Phase 2: Find committees for any agents not yet covered
    additional_committees, covered_agents, coverage_report = _find_committees_for_uncovered_agents(
        new_committee_model, agent_vars, covered_agents
    )
    committees.update(additional_committees)
    report.add_report(coverage_report)

    # Validation and final output
    assert len(committees) >= 1  # We assume quotas are feasible at this stage

    if len(covered_agents) == len(agent_vars):
        report.add_message_and_log("all_agents_in_feasible_committees", logging.INFO)

    return committees, frozenset(covered_agents), report

ilp_results_to_committee(variables)

Extract the selected committee from ILP solver variables.

Parameters:

    variables (dict[str, Var]): dict mapping person_id to binary MIP variables (required)

Returns:

    frozenset[str]: frozenset of person_ids who are selected (have variable value > 0.5)

Raises:

    ValueError: If variables don't have values (solver failed)

Source code in src/sortition_algorithms/committee_generation/common.py
def ilp_results_to_committee(variables: dict[str, mip.entities.Var]) -> frozenset[str]:
    """Extract the selected committee from ILP solver variables.

    Args:
        variables: dict mapping person_id to binary MIP variables

    Returns:
        frozenset of person_ids who are selected (have variable value > 0.5)

    Raises:
        ValueError: If variables don't have values (solver failed)
    """
    try:
        committee = frozenset(person_id for person_id in variables if variables[person_id].x > 0.5)
    # unfortunately, MIP sometimes throws generic Exceptions rather than a subclass
    except Exception as error:
        msg = f"It seems like some variables do not have a value. Original exception: {error}."
        raise ValueError(msg, "variables_without_value", {"error": str(error)}) from error

    return committee
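
After a successful solve, each binary variable's .x attribute holds its value. A duck-typed sketch (stand-in objects instead of real mip variables):

from types import SimpleNamespace

from sortition_algorithms.committee_generation.common import ilp_results_to_committee

variables = {
    "p1": SimpleNamespace(x=1.0),
    "p2": SimpleNamespace(x=0.0),
    "p3": SimpleNamespace(x=1.0),
}
assert ilp_results_to_committee(variables) == frozenset({"p1", "p3"})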

setup_committee_generation(features, people, number_people_wanted, check_same_address_columns)

Set up the integer linear program for committee generation.

Parameters:

    features (FeatureCollection): FeatureCollection with min/max quotas (required)
    people (People): People object with pool members (required)
    number_people_wanted (int): desired size of the panel (required)
    check_same_address_columns (list[str]): columns to check for same address, or empty list if not checking addresses (required)

Returns:

    tuple[Model, dict[str, Var]]: tuple of (MIP model, dict mapping person_id to binary variables)

Raises:

    InfeasibleQuotasError: If quotas are infeasible, includes suggested relaxations
    SelectionError: If solver fails for other reasons

Source code in src/sortition_algorithms/committee_generation/common.py
def setup_committee_generation(
    features: FeatureCollection, people: People, number_people_wanted: int, check_same_address_columns: list[str]
) -> tuple[mip.model.Model, dict[str, mip.entities.Var]]:
    """Set up the integer linear program for committee generation.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: columns to check for same address, or empty list if
                                    not checking addresses.

    Returns:
        tuple of (MIP model, dict mapping person_id to binary variables)

    Raises:
        InfeasibleQuotasError: If quotas are infeasible, includes suggested relaxations
        SelectionError: If solver fails for other reasons
    """
    model = mip.Model(sense=mip.MAXIMIZE)
    model.verbose = 0  # TODO: get debug level from settings

    # Binary variable for each person (selected/not selected)
    agent_vars = {person_id: model.add_var(var_type=mip.BINARY) for person_id in people}

    # Must select exactly the desired number of people
    model.add_constr(mip.xsum(agent_vars.values()) == number_people_wanted)

    # Respect min/max quotas for each feature value
    for feature_name, fvalue_name, fv_minmax in iterate_feature_collection(features):
        # Count people with this feature-value who are selected
        number_feature_value_agents = mip.xsum(
            agent_vars[person_id]
            for person_id, person_data in people.items()
            if person_data[feature_name].lower() == fvalue_name.lower()
        )

        # Add min/max constraints
        model.add_constr(number_feature_value_agents >= fv_minmax.min)
        model.add_constr(number_feature_value_agents <= fv_minmax.max)

    # Household constraints: at most 1 person per household
    if check_same_address_columns:
        for housemates in people.households(check_same_address_columns).values():
            if len(housemates) > 1:
                model.add_constr(mip.xsum(agent_vars[member_id] for member_id in housemates) <= 1)

    # Test feasibility by optimizing once
    status = model.optimize()
    if status == mip.OptimizationStatus.INFEASIBLE:
        relaxed_features, output_lines = _relax_infeasible_quotas(
            features, people, number_people_wanted, check_same_address_columns
        )
        raise errors.InfeasibleQuotasError(relaxed_features, output_lines)
    if status != mip.OptimizationStatus.OPTIMAL:
        msg = (
            f"No feasible committees found, solver returns code {status} (see "
            "https://docs.python-mip.com/en/latest/classes.html#optimizationstatus)."
        )
        raise errors.SelectionError(msg)

    return model, agent_vars

Selection algorithms for stratified sampling.

find_random_sample_legacy(people, features, number_people_wanted, check_same_address_columns=None)

Legacy stratified random selection algorithm.

Implements the original algorithm that uses greedy selection based on priority ratios. Always selects from the most urgently needed category first (highest ratio of (min-selected)/remaining), then randomly picks within that category.

Parameters:

Name Type Description Default
people People

People collection

required
features FeatureCollection

Feature definitions with min/max targets

required
number_people_wanted int

Number of people to select

required
check_same_address_columns list[str] | None

Address columns for household identification, or empty if no address checking to be done.

None

Returns:

Type Description
list[frozenset[str]]

Tuple of (selected_committees, output_messages) where:

RunReport
  • selected_committees: List containing one frozenset of selected person IDs
tuple[list[frozenset[str]], RunReport]
  • report: report containing log messages about the selection process

Raises:

Type Description
SelectionError

If selection becomes impossible (not enough people, etc.)

Source code in src/sortition_algorithms/committee_generation/legacy.py
def find_random_sample_legacy(
    people: People,
    features: FeatureCollection,
    number_people_wanted: int,
    check_same_address_columns: list[str] | None = None,
) -> tuple[list[frozenset[str]], RunReport]:
    """
    Legacy stratified random selection algorithm.

    Implements the original algorithm that uses greedy selection based on priority ratios.
    Always selects from the most urgently needed category first (highest ratio of
    (min-selected)/remaining), then randomly picks within that category.

    Args:
        people: People collection
        features: Feature definitions with min/max targets
        number_people_wanted: Number of people to select
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        Tuple of (selected_committees, report) where:
        - selected_committees: List containing one frozenset of selected person IDs
        - report: report containing log messages about the selection process

    Raises:
        SelectionError: If selection becomes impossible (not enough people, etc.)
    """
    report = RunReport()
    report.add_message("using_legacy_algorithm")
    people_selected: set[str] = set()

    # Create PeopleFeatures and initialize
    people_features = PeopleFeatures(people, features, check_same_address_columns or [])
    people_features.update_all_features_remaining()
    people_features.prune_for_feature_max_0()

    # Main selection loop
    for count in range(number_people_wanted):
        # Find the category with highest priority ratio
        try:
            ratio_result = people_features.find_max_ratio_category()
        except errors.SelectionError as e:
            msg = f"Selection failed on iteration {count + 1}: {e}"
            raise errors.RetryableSelectionError(msg) from e

        # Find the randomly selected person within that category
        target_feature = ratio_result.feature_name
        target_value = ratio_result.feature_value
        random_position = ratio_result.random_person_index

        selected_person_key = people_features.people.find_person_by_position_in_category(
            target_feature, target_value, random_position
        )

        # Should never select the same person twice
        assert selected_person_key not in people_selected, f"Person {selected_person_key} was already selected"

        # Select the person (this also removes household members if configured)
        people_selected.add(selected_person_key)
        selected_person_data = people_features.people.get_person_dict(selected_person_key)
        household_members_removed = people_features.select_person(selected_person_key)

        # Add output messages about household member removal
        if household_members_removed:
            report.add_line(
                f"Selected {selected_person_key}, also removed household members: "
                f"{', '.join(household_members_removed)}"
            )

        # Handle any categories that are now full after this selection
        try:
            category_report = people_features.handle_category_full_deletions(selected_person_data)
            report.add_report(category_report)
        except errors.SelectionError as e:
            msg = f"Selection failed after selecting {selected_person_key}: {e}"
            raise errors.RetryableSelectionError(msg) from e

        # Check if we're about to run out of people (but not on the last iteration)
        if count < (number_people_wanted - 1) and people_features.people.count == 0:
            msg = "Selection failed: Ran out of people before completing selection"
            raise errors.RetryableSelectionError(msg)

    # Return in legacy format: list containing single frozenset
    return [frozenset(people_selected)], report
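
A minimal sketch (assumptions: `people` and `features` are already loaded; the import path mirrors the "Source code in ..." line above):

from sortition_algorithms import errors
from sortition_algorithms.committee_generation.legacy import find_random_sample_legacy

try:
    committees, report = find_random_sample_legacy(people, features, number_people_wanted=24)
    panel = committees[0]  # the single frozenset of selected person IDs
except errors.RetryableSelectionError:
    pass  # worth retrying, as run_stratification does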

find_distribution_leximin(features, people, number_people_wanted, check_same_address_columns)

Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected (just like maximin), but breaks ties to maximize the second-lowest probability, breaks further ties to maximize the third-lowest probability and so forth.

Parameters:

- features (FeatureCollection, required): FeatureCollection with min/max quotas
- people (People, required): People object with pool members
- number_people_wanted (int, required): desired size of the panel
- check_same_address_columns (list[str], required): Address columns for household identification, or empty if no address checking is to be done.

Returns:

tuple[list[frozenset[str]], list[float], RunReport]: tuple of (committees, probabilities, report) where:

- committees: list of feasible committees (each a frozenset of agent IDs)
- probabilities: list of probabilities for each committee
- report: RunReport with debug messages

Raises:

- RuntimeError: If Gurobi is not available

Source code in src/sortition_algorithms/committee_generation/leximin.py
def find_distribution_leximin(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
) -> tuple[list[frozenset[str]], list[float], RunReport]:
    """Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected
    (just like maximin), but breaks ties to maximize the second-lowest probability, breaks further ties to maximize the
    third-lowest probability and so forth.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        tuple of (committees, probabilities, report)
        - committees: list of feasible committees (frozenset of agent IDs)
        - probabilities: list of probabilities for each committee
        - report: RunReport with debug messages

    Raises:
        RuntimeError: If Gurobi is not available
    """
    if not GUROBI_AVAILABLE:
        msg = "Leximin algorithm requires Gurobi solver which is not available"
        raise RuntimeError(msg, "gurobi_not_available", {})

    report = RunReport()
    report.add_message_and_log("using_leximin_algorithm", logging.INFO)
    grb.setParam("OutputFlag", 0)

    # Set up an ILP that can be used for discovering new feasible committees
    new_committee_model, agent_vars = setup_committee_generation(
        features, people, number_people_wanted, check_same_address_columns
    )

    # Find initial committees that cover every possible agent
    committees, covered_agents, initial_report = generate_initial_committees(
        new_committee_model, agent_vars, 3 * people.count
    )
    report.add_report(initial_report)

    # Run the main leximin optimization loop to fix agent probabilities
    fixed_probabilities = _run_leximin_main_loop(new_committee_model, agent_vars, committees, people, report)

    # Convert fixed agent probabilities to committee probabilities
    probabilities_normalised = _solve_leximin_primal_for_final_probabilities(committees, fixed_probabilities)

    return list(committees), probabilities_normalised, report
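
A minimal sketch (requires Gurobi; the import path is assumed from the source path above, and `features`/`people` are assumed to be loaded elsewhere):

from sortition_algorithms.committee_generation.leximin import find_distribution_leximin

committees, probabilities, report = find_distribution_leximin(
    features, people, number_people_wanted=24, check_same_address_columns=[]
)
# the result is a probability distribution over committees
assert abs(sum(probabilities) - 1.0) < 1e-6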

find_distribution_maximin(features, people, number_people_wanted, check_same_address_columns)

Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected.

Parameters:

- features (FeatureCollection, required): FeatureCollection with min/max quotas
- people (People, required): People object with pool members
- number_people_wanted (int, required): desired size of the panel
- check_same_address_columns (list[str], required): Address columns for household identification, or empty if no address checking is to be done.

Returns:

tuple[list[frozenset[str]], list[float], RunReport]: tuple of (committees, probabilities, report) where:

- committees: list of feasible committees (each a frozenset of agent IDs)
- probabilities: list of probabilities for each committee
- report: RunReport with debug messages

Source code in src/sortition_algorithms/committee_generation/maximin.py
def find_distribution_maximin(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
) -> tuple[list[frozenset[str]], list[float], RunReport]:
    """Find a distribution over feasible committees that maximizes the minimum probability of an agent being selected.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        tuple of (committees, probabilities, report)
        - committees: list of feasible committees (frozenset of agent IDs)
        - probabilities: list of probabilities for each committee
        - report: RunReport with debug messages
    """
    report = RunReport()
    report.add_message_and_log("using_maximin_algorithm", logging.INFO)

    # Set up an ILP that can be used for discovering new feasible committees
    new_committee_model, agent_vars = setup_committee_generation(
        features, people, number_people_wanted, check_same_address_columns
    )

    # Find initial committees that cover every possible agent
    committees, covered_agents, init_report = generate_initial_committees(new_committee_model, agent_vars, people.count)
    report.add_report(init_report)

    # Set up the incremental LP model for column generation
    incremental_model, incr_agent_vars, upper_bound_var = _setup_maximin_incremental_model(committees, covered_agents)

    # Run the main optimization loop
    return _run_maximin_optimization_loop(
        new_committee_model,
        agent_vars,
        incremental_model,
        incr_agent_vars,
        upper_bound_var,
        committees,
        covered_agents,
        report,
    )
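
A minimal sketch showing how the returned distribution might be inspected. The helper below is hypothetical, and `features`/`people` are assumed to be loaded elsewhere:

from sortition_algorithms.committee_generation.maximin import find_distribution_maximin

committees, probabilities, report = find_distribution_maximin(
    features, people, number_people_wanted=24, check_same_address_columns=[]
)

def selection_probability(person_id: str) -> float:
    """Probability that person_id ends up on the drawn panel."""
    return sum(p for c, p in zip(committees, probabilities) if person_id in c)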

find_distribution_nash(features, people, number_people_wanted, check_same_address_columns)

Find a distribution over feasible committees that maximizes the Nash welfare, i.e., the product of selection probabilities over all persons.

Parameters:

- features (FeatureCollection, required): FeatureCollection with min/max quotas
- people (People, required): People object with pool members
- number_people_wanted (int, required): desired size of the panel
- check_same_address_columns (list[str], required): Address columns for household identification, or empty if no address checking is to be done.

Returns:

tuple[list[frozenset[str]], list[float], RunReport]: tuple of (committees, probabilities, report) where:

- committees: list of feasible committees (each a frozenset of agent IDs)
- probabilities: list of probabilities for each committee
- report: RunReport with debug messages

The algorithm maximizes the product of selection probabilities Πᵢ pᵢ by equivalently maximizing log(Πᵢ pᵢ) = Σᵢ log(pᵢ). If some person i is not included in any feasible committee, their pᵢ is 0, and this sum is -∞. We maximize Σᵢ log(pᵢ) where i is restricted to range over persons that can possibly be included.

Source code in src/sortition_algorithms/committee_generation/nash.py
def find_distribution_nash(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
) -> tuple[list[frozenset[str]], list[float], RunReport]:
    """Find a distribution over feasible committees that maximizes the Nash welfare, i.e., the product of
    selection probabilities over all persons.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: Address columns for household identification, or empty
                                    if no address checking to be done.

    Returns:
        tuple of (committees, probabilities, report)
        - committees: list of feasible committees (frozenset of agent IDs)
        - probabilities: list of probabilities for each committee
        - report: RunReport with debug messages

    The algorithm maximizes the product of selection probabilities Πᵢ pᵢ by equivalently maximizing
    log(Πᵢ pᵢ) = Σᵢ log(pᵢ). If some person i is not included in any feasible committee, their pᵢ is 0, and
    this sum is -∞. We maximize Σᵢ log(pᵢ) where i is restricted to range over persons that can possibly be included.
    """
    report = RunReport()
    report.add_message_and_log("using_nash_algorithm", logging.INFO)

    # Set up an ILP used for discovering new feasible committees
    new_committee_model, agent_vars = setup_committee_generation(
        features, people, number_people_wanted, check_same_address_columns
    )

    # Find initial committees that include every possible agent
    committee_set, covered_agents, initial_report = generate_initial_committees(
        new_committee_model, agent_vars, 2 * people.count
    )
    committees = list(committee_set)
    report.add_report(initial_report)

    # Map the covered agents to indices in a list for easier matrix representation
    entitlements, contributes_to_entitlement = _define_entitlements(covered_agents)

    # Run the main Nash welfare optimization loop
    return _run_nash_optimization_loop(
        new_committee_model,
        agent_vars,
        committees,
        entitlements,
        contributes_to_entitlement,
        covered_agents,
        number_people_wanted,
        report,
    )
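
A sketch illustrating the objective described above: the Nash welfare is the product of per-person selection probabilities, optimised via its logarithm. The helper and variables are hypothetical, with `committees` and `probabilities` as returned by find_distribution_nash:

import math

def selection_probability(person_id, committees, probabilities):
    return sum(p for c, p in zip(committees, probabilities) if person_id in c)

covered = set().union(*committees)  # people in at least one feasible committee
log_welfare = sum(
    math.log(selection_probability(pid, committees, probabilities)) for pid in covered
)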

find_random_sample(features, people, number_people_wanted, check_same_address_columns, selection_algorithm='maximin', test_selection=False, number_selections=1)

Main algorithm to find one or multiple random committees.

Parameters:

- features (FeatureCollection, required): FeatureCollection with min/max quotas
- people (People, required): People object with pool members
- number_people_wanted (int, required): desired size of the panel
- check_same_address_columns (list[str], required): columns for the address to check, or an empty list if no check is required
- selection_algorithm (str, default 'maximin'): one of "legacy", "maximin", "leximin", or "nash"
- test_selection (bool, default False): if set, do not do a random selection, but just return some valid panel. Useful for quickly testing whether quotas are satisfiable, but should always be False for actual selection!
- number_selections (int, default 1): how many panels to return. Most of the time this should be 1, meaning a single panel is chosen. For a value n ≥ 2, the function returns a list of length n containing multiple panels (some panels might be repeated in the list); the eventual panel should then be drawn uniformly at random from the returned list.

Returns:

tuple[list[frozenset[str]], RunReport]: tuple of (committee_lottery, report) where:

- committee_lottery: list of committees, where each committee is a frozenset of pool member IDs
- report: RunReport with debug messages

Raises:

- InfeasibleQuotasError: if the quotas cannot be satisfied; the error includes a suggestion for how to modify them
- SelectionError: in multiple other failure cases
- ValueError: for invalid parameters
- RuntimeError: if the required solver is not available

Source code in src/sortition_algorithms/core.py
def find_random_sample(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    check_same_address_columns: list[str],
    selection_algorithm: str = "maximin",
    test_selection: bool = False,
    number_selections: int = 1,
) -> tuple[list[frozenset[str]], RunReport]:
    """Main algorithm to find one or multiple random committees.

    Args:
        features: FeatureCollection with min/max quotas
        people: People object with pool members
        number_people_wanted: desired size of the panel
        check_same_address_columns: columns for the address to check, or empty list if no check required
        selection_algorithm: one of "legacy", "maximin", "leximin", or "nash"
        test_selection: if set, do not do a random selection, but just return some valid panel.
            Useful for quickly testing whether quotas are satisfiable, but should always be false for actual selection!
        number_selections: how many panels to return. Most of the time, this should be set to 1, which means that
            a single panel is chosen. When specifying a value n ≥ 2, the function will return a list of length n,
            containing multiple panels (some panels might be repeated in the list). In this case the eventual panel
            should be drawn uniformly at random from the returned list.

    Returns:
        tuple of (committee_lottery, report)
        - committee_lottery: list of committees, where each committee is a frozen set of pool member ids
        - report: report with debug strings

    Raises:
        InfeasibleQuotasError: if the quotas cannot be satisfied, which includes a suggestion for how to modify them
        SelectionError: in multiple other failure cases
        ValueError: for invalid parameters
        RuntimeError: if required solver is not available
    """
    # Input validation
    if test_selection and number_selections != 1:
        msg = (
            "Running the test selection does not support generating a transparent lottery, so, if "
            "`test_selection` is true, `number_selections` must be 1."
        )
        raise ValueError(msg, "test_selection_multiple_selections", {})

    if selection_algorithm == "legacy" and number_selections != 1:
        msg = (
            "Currently, the legacy algorithm does not support generating a transparent lottery, "
            "so `number_selections` must be set to 1."
        )
        raise ValueError(msg, "legacy_multiple_selections", {})

    # Quick test selection using find_any_committee
    if test_selection:
        logger.info("Running test selection.")
        return find_any_committee(features, people, number_people_wanted, check_same_address_columns)

    report = RunReport()

    # Check if Gurobi is available for leximin
    if selection_algorithm == "leximin" and not GUROBI_AVAILABLE:
        report.add_message("gurobi_unavailable_switching")
        selection_algorithm = "maximin"

    # Route to appropriate algorithm
    if selection_algorithm == "legacy":
        return find_random_sample_legacy(
            people,
            features,
            number_people_wanted,
            check_same_address_columns,
        )
    elif selection_algorithm == "leximin":
        committees, probabilities, new_report = find_distribution_leximin(
            features, people, number_people_wanted, check_same_address_columns
        )
    elif selection_algorithm == "maximin":
        committees, probabilities, new_report = find_distribution_maximin(
            features, people, number_people_wanted, check_same_address_columns
        )
    elif selection_algorithm == "nash":
        committees, probabilities, new_report = find_distribution_nash(
            features, people, number_people_wanted, check_same_address_columns
        )
    else:
        msg = (
            f"Unknown selection algorithm {selection_algorithm!r}, must be either 'legacy', 'leximin', "
            f"'maximin', or 'nash'."
        )
        raise ValueError(msg, "unknown_selection_algorithm", {"algorithm": selection_algorithm})

    report.add_report(new_report)

    # Post-process the distribution
    committees, probabilities = standardize_distribution(committees, probabilities)
    if len(committees) > people.count:
        report.add_message_and_log(
            "basic_solution_warning",
            logging.WARNING,
            algorithm=selection_algorithm,
            num_panels=len(committees),
            num_agents=people.count,
            min_probs=min(probabilities),
        )

    assert len(set(committees)) == len(committees)

    stats_report = _distribution_stats(people, committees, probabilities)
    report.add_report(stats_report)

    # Convert to lottery
    committee_lottery = lottery_rounding(committees, probabilities, number_selections)

    return committee_lottery, report
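
A minimal end-to-end sketch of the main entry point. The column names are hypothetical, and `features`/`people` are assumed to have been loaded elsewhere (e.g. via read_in_features and a people loader):

from sortition_algorithms import errors
from sortition_algorithms.core import find_random_sample

try:
    lottery, report = find_random_sample(
        features,
        people,
        number_people_wanted=24,
        check_same_address_columns=["address1", "postcode"],  # hypothetical columns
        selection_algorithm="maximin",
    )
    panel = lottery[0]  # with number_selections=1 the lottery holds a single panel
except errors.InfeasibleQuotasError as err:
    print(err)  # includes suggested quota relaxations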

lottery_rounding(committees, probabilities, number_selections)

Convert probability distribution over committees to a discrete lottery.

Parameters:

- committees (list[frozenset[str]], required): list of committees
- probabilities (list[float], required): corresponding probabilities (must sum to 1)
- number_selections (int, required): number of committees to return

Returns:

list[frozenset[str]]: list of committees (may contain duplicates) of length number_selections

Source code in src/sortition_algorithms/core.py
def lottery_rounding(
    committees: list[frozenset[str]],
    probabilities: list[float],
    number_selections: int,
) -> list[frozenset[str]]:
    """Convert probability distribution over committees to a discrete lottery.

    Args:
        committees: list of committees
        probabilities: corresponding probabilities (must sum to 1)
        number_selections: number of committees to return

    Returns:
        list of committees (may contain duplicates) of length number_selections
    """
    assert len(committees) == len(probabilities)
    assert number_selections >= 1

    num_copies: list[int] = []
    residuals: list[float] = []
    for _, prob in zip(committees, probabilities, strict=False):
        scaled_prob = prob * number_selections
        num_copies.append(int(scaled_prob))  # give lower quotas
        residuals.append(scaled_prob - int(scaled_prob))

    rounded_up_indices = pipage_rounding(list(enumerate(residuals)))
    for committee_index in rounded_up_indices:
        num_copies[committee_index] += 1

    committee_lottery: list[frozenset[str]] = []
    for committee, committee_copies in zip(committees, num_copies, strict=False):
        committee_lottery += [committee for _ in range(committee_copies)]

    return committee_lottery
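
A worked sketch with hypothetical committees: with number_selections=4, a committee of probability 0.3 is scaled to 1.2, so it gets one guaranteed copy plus a 0.2 residual chance of a second copy via pipage rounding; expected copies equal 4 × 0.3.

a, b, c = frozenset({"p1"}), frozenset({"p2"}), frozenset({"p3"})
lottery = lottery_rounding([a, b, c], [0.3, 0.5, 0.2], number_selections=4)
assert len(lottery) == 4  # may contain duplicate committees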

pipage_rounding(marginals)

Pipage rounding algorithm for converting fractional solutions to integer solutions.

Takes a list of (object, probability) pairs and randomly rounds them to a set of objects such that the expected number of times each object appears equals its probability.

Parameters:

- marginals (list[tuple[int, float]], required): list of (object, probability) pairs where the probabilities sum to an integer

Returns:

list[int]: list of objects that were selected

Source code in src/sortition_algorithms/core.py
def pipage_rounding(marginals: list[tuple[int, float]]) -> list[int]:
    """Pipage rounding algorithm for converting fractional solutions to integer solutions.

    Takes a list of (object, probability) pairs and randomly rounds them to a set of objects
    such that the expected number of times each object appears equals its probability.

    Args:
        marginals: list of (object, probability) pairs where probabilities sum to an integer

    Returns:
        list of objects that were selected
    """
    assert all(0.0 <= p <= 1.0 for _, p in marginals)

    outcomes: list[int] = []
    while True:
        if len(marginals) == 0:
            return outcomes
        if len(marginals) == 1:
            obj, prob = marginals[0]
            if random_provider().uniform(0.0, 1.0) < prob:
                outcomes.append(obj)
            marginals = []
        else:
            obj0, prob0 = marginals[0]
            if prob0 > 1.0 - EPS2:
                outcomes.append(obj0)
                marginals = marginals[1:]
                continue
            if prob0 < EPS2:
                marginals = marginals[1:]
                continue

            obj1, prob1 = marginals[1]
            if prob1 > 1.0 - EPS2:
                outcomes.append(obj1)
                marginals = [marginals[0]] + marginals[2:]
                continue
            if prob1 < EPS2:
                marginals = [marginals[0]] + marginals[2:]
                continue

            inc0_dec1_amount = min(
                1.0 - prob0, prob1
            )  # maximal amount that prob0 can be increased and prob1 can be decreased
            dec0_inc1_amount = min(prob0, 1.0 - prob1)
            choice_probability = dec0_inc1_amount / (inc0_dec1_amount + dec0_inc1_amount)

            if random_provider().uniform(0.0, 1.0) < choice_probability:  # increase prob0 and decrease prob1
                prob0 += inc0_dec1_amount
                prob1 -= inc0_dec1_amount
            else:
                prob0 -= dec0_inc1_amount
                prob1 += dec0_inc1_amount
            marginals = [(obj0, prob0), (obj1, prob1)] + marginals[2:]
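
A worked sketch: the probabilities below sum to 2, so exactly two of the four objects come back, and object 0 is always among them because its probability is 1.0.

chosen = pipage_rounding([(0, 1.0), (1, 0.5), (2, 0.25), (3, 0.25)])
assert len(chosen) == 2 and 0 in chosen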

run_stratification(features, people, number_people_wanted, settings, *, test_selection=False, number_selections=1, already_selected=None)

Run stratified random selection with retry logic.

Parameters:

- features (FeatureCollection, required): FeatureCollection with min/max quotas for each feature value
- people (People, required): People object containing the pool of candidates
- number_people_wanted (int, required): Desired size of the panel
- settings (Settings, required): Settings object containing configuration
- test_selection (bool, default False): If True, don't randomize (for testing only)
- number_selections (int, default 1): Number of panels to return
- already_selected (People | None, default None): People who have already been selected

Returns:

tuple[bool, list[frozenset[str]], RunReport]: Tuple of (success, selected_committees, report) where:

- success: Whether selection succeeded within max attempts
- selected_committees: List of committees (frozensets of person IDs)
- report: Contains debug and status messages

Raises:

- Exception: If number_people_wanted is outside valid range for any feature
- ValueError: For invalid parameters
- RuntimeError: If required solver is not available
- InfeasibleQuotasError: If quotas cannot be satisfied

Source code in src/sortition_algorithms/core.py
def run_stratification(
    features: FeatureCollection,
    people: People,
    number_people_wanted: int,
    settings: Settings,
    *,
    test_selection: bool = False,
    number_selections: int = 1,
    already_selected: People | None = None,
) -> tuple[bool, list[frozenset[str]], RunReport]:
    """Run stratified random selection with retry logic.

    Args:
        features: FeatureCollection with min/max quotas for each feature value
        people: People object containing the pool of candidates
        number_people_wanted: Desired size of the panel
        settings: Settings object containing configuration
        test_selection: If True, don't randomize (for testing only)
        number_selections: Number of panels to return (default: 1)
        already_selected: People who have already been selected (optional)

    Returns:
        Tuple of (success, selected_committees, report)
        - success: Whether selection succeeded within max attempts
        - selected_committees: List of committees (frozensets of person IDs)
        - report: contains debug and status messages

    Raises:
        Exception: If number_people_wanted is outside valid range for any feature
        ValueError: For invalid parameters
        RuntimeError: If required solver is not available
        InfeasibleQuotasError: If quotas cannot be satisfied
    """
    success = False
    report = RunReport()
    people_selected: list[frozenset[str]] = []

    try:
        working_people = exclude_matching_selected_addresses(people, already_selected, settings)
        dropped_count = people.count - working_people.count
        if dropped_count:
            report.add_line_and_log(
                f"Dropped {dropped_count} people who have an address matching a selected person.", logging.INFO
            )
        # Check if desired number is within feature constraints
        check_desired(features, number_people_wanted)
        check_enough_people_for_every_feature_value(features, working_people)
    except errors.SelectionError as error:
        report.add_error(error)
        return False, people_selected, report

    # Set random seed if specified
    # If the seed is zero or None, we use the secrets module, as it is better
    # from a security point of view
    set_random_provider(settings.random_number_seed)

    if test_selection:
        report.add_message("test_selection_warning", ReportLevel.CRITICAL)

    report.add_message("initial_state", ReportLevel.IMPORTANT)
    report.add_report(_initial_category_info_table(features, working_people))

    tries = 0
    for tries in range(settings.max_attempts):
        people_selected = []

        report.add_message_and_log("trial_number", logging.WARNING, trial=tries + 1)

        try:
            people_selected, new_report = find_random_sample(
                features,
                working_people,
                number_people_wanted,
                settings.normalised_address_columns,
                settings.selection_algorithm,
                test_selection,
                number_selections,
            )
            report.add_report(new_report)

            # Check if targets were met (only works for number_selections = 1)
            # This raises an error if we did not select properly
            _check_category_selected(features, working_people, people_selected, number_selections)

            report.add_message("selection_success", ReportLevel.IMPORTANT)
            report.add_report(_category_info_table(features, working_people, people_selected, number_people_wanted))
            success = True
            break

        except errors.SelectionError as serr:
            if serr.is_retryable:
                report.add_error(serr, is_fatal=False)
                report.add_message("retry_after_error", error=str(serr))
                # we do not break here, we try again.
            else:
                report.add_error(serr)
                break
        # these are all fatal errors
        except (ValueError, RuntimeError, errors.InfeasibleQuotasCantRelaxError) as err:
            report.add_error(err)
            break

    if not success:
        report.add_message("selection_failed", ReportLevel.IMPORTANT, attempts=tries + 1)

    return success, people_selected, report
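
A minimal sketch of this retrying entry point, assuming `features`, `people` and a configured Settings instance named `settings` exist:

from sortition_algorithms.core import run_stratification

success, committees, report = run_stratification(
    features, people, number_people_wanted=24, settings=settings
)
if success:
    panel = committees[0]
# on failure, `report` holds the accumulated error and status messages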

selected_remaining_tables(full_people, people_selected, features, settings, already_selected=None, exclude_matching_addresses=True)

Build the tables of selected and remaining people after a selection run. Returns rows for the selected people, rows for everyone left in the pool (with the selected people removed and, optionally, anyone sharing an address with a selected person), and a list of output lines.

people_selected is a single frozenset[str] - it must be unwrapped before being passed to this function.

Source code in src/sortition_algorithms/core.py
def selected_remaining_tables(
    full_people: People,
    people_selected: frozenset[str],
    features: FeatureCollection,
    settings: Settings,
    already_selected: People | None = None,
    exclude_matching_addresses: bool = True,
) -> tuple[list[list[str]], list[list[str]], list[str]]:
    """
    Build the tables of selected and remaining people after a selection run.

    people_selected is a single frozenset[str] - it must be unwrapped before being passed
    to this function.
    """
    people_working = deepcopy(full_people)
    output_lines: list[str] = []

    # this function only reads from people, so pass in full_people
    people_selected_rows = person_list_to_table(people_selected, full_people, features, settings)

    # now delete the selected people
    for pkey in people_selected:
        people_working.remove(pkey)

    # now delete the people at the same address as a selected person
    # TODO: consider retiring this code once we use already_selected everywhere
    # and then also drop exclude_matching_addresses variable
    num_same_address_deleted = 0
    if (
        exclude_matching_addresses
        and settings.check_same_address
        and (already_selected is None or not already_selected.count)
    ):
        for pkey in people_selected:
            pkey_to_delete = list(full_people.matching_address(pkey, settings.check_same_address_columns))
            num_same_address_deleted += len(pkey_to_delete)
            # then delete this/these people at the same address from the reserve/remaining pool
            people_working.remove_many(pkey_to_delete)

    # add the columns to keep into remaining people
    # as above all these values are all in people_working but this is tidier...
    # this function only reads from people, so pass in full_people
    people_remaining_rows = person_list_to_table(people_working, full_people, features, settings)
    return people_selected_rows, people_remaining_rows, output_lines
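
A sketch of the unwrapping the docstring asks for, assuming `committees` came from run_stratification with number_selections=1 and that `people`, `features` and `settings` exist:

selected_rows, remaining_rows, lines = selected_remaining_tables(
    full_people=people,
    people_selected=committees[0],  # unwrap the single frozenset
    features=features,
    settings=settings,
)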

BadDataError

Bases: SortitionBaseError

Error for when bad data is found while reading things in

Source code in src/sortition_algorithms/errors.py
class BadDataError(SortitionBaseError):
    """Error for when bad data is found while reading things in"""

InfeasibleQuotasCantRelaxError

Bases: SortitionBaseError

The quotas can't be met, and no feasible relaxation was found

Source code in src/sortition_algorithms/errors.py
class InfeasibleQuotasCantRelaxError(SortitionBaseError):
    """The quotas can't be met, and no feasible relaxation was found"""

InfeasibleQuotasError

Bases: SelectionMultilineError

The quotas can't be met, and a feasible relaxation was found.

The details of what relaxations are recommended are included in the error.

Source code in src/sortition_algorithms/errors.py
class InfeasibleQuotasError(SelectionMultilineError):
    """
    The quotas can't be met, and a feasible relaxation was found.

    The details of what relaxations are recommended are included in the error.
    """

    def __init__(self, features: "FeatureCollection", output: list[str]) -> None:
        self.features = features
        super().__init__(lines=["The quotas are infeasible:", *output])

ParseErrorsCollector

Class that we can add errors to, but errors with empty messages will be dropped

Source code in src/sortition_algorithms/errors.py
class ParseErrorsCollector:
    """Class that we can add errors to, but errors with empty messages will be dropped"""

    def __init__(self) -> None:
        self.errors: list[ParseTableErrorMsg | ParseTableMultiValueErrorMsg] = []

    def __len__(self) -> int:
        """This means that we will be falsy if len is 0, so is effectively a __bool__ as well"""
        return len(self.errors)

    def add(
        self,
        msg: str,
        key: str,
        value: str,
        row: int,
        row_name: str,
        error_code: str = "",
        error_params: dict[str, str] | dict[str, str | int] | None = None,
    ) -> None:
        if msg:
            self.errors.append(
                ParseTableErrorMsg(
                    row=row,
                    row_name=row_name,
                    key=key,
                    value=value,
                    msg=msg,
                    error_code=error_code,
                    error_params=error_params or {},
                )
            )

    def add_multi_value(
        self,
        msg: str,
        keys: list[str],
        values: list[str],
        row: int,
        row_name: str,
        error_code: str = "",
        error_params: dict[str, str | int] | None = None,
    ) -> None:
        if msg:
            self.errors.append(
                ParseTableMultiValueErrorMsg(
                    row=row,
                    row_name=row_name,
                    keys=keys,
                    values=values,
                    msg=msg,
                    error_code=error_code,
                    error_params=error_params or {},
                )
            )

    def to_error(self) -> ParseTableMultiError:
        return ParseTableMultiError(self.errors)

__len__()

This means that we will be falsy if len is 0, so is effectively a __bool__ as well

Source code in src/sortition_algorithms/errors.py
def __len__(self) -> int:
    """This means that we will be falsy if len is 0, so is effectively a __bool__ as well"""
    return len(self.errors)

ParseTableMultiError

Bases: SelectionMultilineError

Specifically for collecting errors from parsing a table

This has information that can be collected at a low level. Then higher level code can read the errors and make a SelectionMultilineError instance with strings with more context, relating to a CSV file, Spreadsheet etc.

Source code in src/sortition_algorithms/errors.py
class ParseTableMultiError(SelectionMultilineError):
    """
    Specifically for collecting errors from parsing a table

    This has information that can be collected at a low level. Then higher level code can read
    the errors and make a SelectionMultilineError instance with strings with more context,
    relating to a CSV file, Spreadsheet etc.
    """

    def __init__(self, errors: list[ParseTableErrorMsg | ParseTableMultiValueErrorMsg] | None = None) -> None:
        self.all_errors: list[ParseTableErrorMsg | ParseTableMultiValueErrorMsg] = errors or []

    def __len__(self) -> int:
        """This means that we will be falsy if len is 0, so is effectively a __bool__ as well"""
        return len(self.all_errors)

    def lines(self) -> list[str]:
        return [str(e) for e in self.all_errors]

    def combine(self, other: SelectionMultilineError) -> None:
        """Add all the lines from the other error to this one."""
        assert isinstance(other, ParseTableMultiError)
        self.all_errors += other.all_errors

__len__()

This means that we will be falsy if len is 0, so is effectively a __bool__ as well

Source code in src/sortition_algorithms/errors.py
def __len__(self) -> int:
    """This means that we will be falsy if len is 0, so is effectively a __bool__ as well"""
    return len(self.all_errors)

combine(other)

Add all the lines from the other error to this one.

Source code in src/sortition_algorithms/errors.py
def combine(self, other: SelectionMultilineError) -> None:
    """Add all the lines from the other error to this one."""
    assert isinstance(other, ParseTableMultiError)
    self.all_errors += other.all_errors

RetryableSelectionError

Bases: SelectionError

For errors where the selection should be retried.

The main case is when the legacy selection algorithm fails, it can be worth retrying as it might find something the next time around.

Source code in src/sortition_algorithms/errors.py
class RetryableSelectionError(SelectionError):
    """
    For errors where the selection should be retried.

    The main case is when the legacy selection algorithm fails, it can be worth
    retrying as it might find something the next time around.
    """

    is_retryable: bool = True

SelectionError

Bases: SortitionBaseError

Generic error for things that happen in selection

Source code in src/sortition_algorithms/errors.py
class SelectionError(SortitionBaseError):
    """Generic error for things that happen in selection"""

SelectionMultilineError

Bases: SelectionError

Generic error for things that happen in selection - multiline

Source code in src/sortition_algorithms/errors.py
class SelectionMultilineError(SelectionError):
    """Generic error for things that happen in selection - multiline"""

    def __init__(
        self,
        lines: list[str],
        is_retryable: bool = False,
        error_code: str = "",
        error_params: dict[str, str | int] | None = None,
    ) -> None:
        message = "\n".join(lines)
        super().__init__(message=message, error_code=error_code, error_params=error_params)
        self.all_lines = lines
        self.is_retryable = is_retryable

    def __str__(self) -> str:
        return "\n".join(self.lines())

    def to_html(self) -> str:
        return "<br />".join(html.escape(line) for line in self.lines())

    def lines(self) -> list[str]:
        return self.all_lines

    def combine(self, other: "SelectionMultilineError") -> None:
        """Add all the lines from the other error to this one."""
        self.all_lines += other.lines()

combine(other)

Add all the lines from the other error to this one.

Source code in src/sortition_algorithms/errors.py
def combine(self, other: "SelectionMultilineError") -> None:
    """Add all the lines from the other error to this one."""
    self.all_lines += other.lines()

SortitionBaseError

Bases: Exception

A base class that allows all errors to be caught easily.

Source code in src/sortition_algorithms/errors.py
class SortitionBaseError(Exception):
    """A base class that allows all errors to be caught easily."""

    is_retryable: bool = False

    def __init__(
        self, message: str = "", error_code: str = "", error_params: dict[str, str | int] | None = None
    ) -> None:
        super().__init__(message)
        self.error_code = error_code
        self.error_params = error_params or {}
        self.message = message

    def to_html(self) -> str:
        return html.escape(str(self))
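
A sketch of the catch-all pattern this base class enables; the is_retryable flag distinguishes errors worth another attempt:

from sortition_algorithms import errors

try:
    ...  # any selection or parsing call
except errors.SortitionBaseError as err:
    if err.is_retryable:
        ...  # try again, as run_stratification does internally
    else:
        raise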

check_desired(fc, desired_number)

Check if the desired number of people is within the min/max of every feature.

Source code in src/sortition_algorithms/features.py
def check_desired(fc: FeatureCollection, desired_number: int) -> None:
    """
    Check if the desired number of people is within the min/max of every feature.
    """
    errors: list[str] = []
    for feature_name, fvalues in fc.items():
        if desired_number < _fv_minimum_selection(fvalues) or desired_number > _fv_maximum_selection(fvalues):
            errors.append(
                f"The number of people to select ({desired_number}) is out of the range of "
                f"the numbers of people in the {feature_name} feature. It should be within "
                f"[{_fv_minimum_selection(fvalues)}, {_fv_maximum_selection(fvalues)}]."
            )
    if errors:
        raise SelectionMultilineError(errors)

check_min_max(fc, number_to_select=0, feature_column_name='feature')

If the min is bigger than the max we're in trouble i.e. there's an input error

Source code in src/sortition_algorithms/features.py
def check_min_max(fc: FeatureCollection, number_to_select: int = 0, feature_column_name: str = "feature") -> None:
    """
    If the min is bigger than the max we're in trouble i.e. there's an input error
    """
    errors: list[str] = []
    if minimum_selection(fc) > maximum_selection(fc):
        errors += report_min_max_error_details(fc, feature_column_name)
    if number_to_select:
        errors += report_min_max_against_number_to_select(fc, number_to_select, feature_column_name)
    if errors:
        raise SelectionMultilineError(errors)

iterate_feature_collection(features)

Helper function to iterate over feature collection.

Source code in src/sortition_algorithms/features.py
def iterate_feature_collection(features: FeatureCollection) -> Generator[tuple[str, str, FeatureValueMinMax]]:
    """Helper function to iterate over feature collection."""
    for feature_name, feature_values in features.items():
        for value_name, fv_minmax in feature_values.items():
            yield feature_name, value_name, fv_minmax

maximum_selection(fc)

The maximum selection for this set of features is the smallest maximum selection of any individual feature.

Source code in src/sortition_algorithms/features.py
def maximum_selection(fc: FeatureCollection) -> int:
    """
    The maximum selection for this set of features is the smallest maximum selection
    of any individual feature.
    """
    if not fc:
        return 0

    return min(_fv_maximum_selection(fv) for fv in fc.values())

minimum_selection(fc)

The minimum selection for this set of features is the largest minimum selection of any individual feature.

Source code in src/sortition_algorithms/features.py
def minimum_selection(fc: FeatureCollection) -> int:
    """
    The minimum selection for this set of features is the largest minimum selection
    of any individual feature.
    """
    if not fc:
        return 0

    return max(_fv_minimum_selection(fv) for fv in fc.values())

read_in_features(features_head, features_body, number_to_select=0)

Read in stratified selection features and values

Note we do want features_head to ensure we don't have multiple columns with the same name

Source code in src/sortition_algorithms/features.py
def read_in_features(
    features_head: Iterable[str], features_body: Iterable[dict[str, str]], number_to_select: int = 0
) -> tuple[FeatureCollection, str, str]:
    """
    Read in stratified selection features and values

    Note we do want features_head to ensure we don't have multiple columns with the same name
    """
    features: FeatureCollection = CaseInsensitiveDict()
    features_flex, filtered_headers = _feature_headers_flex(list(features_head))
    combined_error = ParseTableMultiError()
    feature_column_name = "feature"
    feature_value_column_name = "value"
    # row 1 is the header, so the body starts on row 2
    for row_number, row in enumerate(features_body, start=2):
        if row_number == 2:
            _, feature_column_name = _get_feature_from_row(row)
            _, feature_value_column_name = _get_feature_value_from_row(row)
        # check the set of keys in the row are the same as the headers
        assert set(filtered_headers) <= set(row.keys())
        stripped_row = utils.normalise_dict(row)
        fname, _ = _get_feature_from_row(row)
        if not fname:
            continue
        try:
            fname, fvalue, fv_minmax = _clean_row(stripped_row, features_flex, row_number)
        except ParseTableMultiError as error:
            # add all the lines into one large error, so we report all the errors in one go
            combined_error.combine(error)
        else:
            if fname not in features:
                features[fname] = CaseInsensitiveDict()
            features[fname][fvalue] = fv_minmax

    # if we got any errors in the above loop, raise the combined error.
    if combined_error:
        raise combined_error

    check_min_max(features, number_to_select=number_to_select, feature_column_name=feature_column_name)
    # check feature_flex to see if we need to set the max here
    # this only changes the max_flex value if these (optional) flex values are NOT set already
    set_default_max_flex(features)
    return CaseInsensitiveDict(features), feature_column_name, feature_value_column_name
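
A minimal sketch feeding CSV rows into read_in_features; the file name and its contents are hypothetical:

import csv

with open("features.csv", newline="") as f:
    reader = csv.DictReader(f, strict=True)
    assert reader.fieldnames is not None
    features, feature_col, value_col = read_in_features(
        list(reader.fieldnames), reader, number_to_select=24
    )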

report_min_max_against_number_to_select(fc, number_to_select, feature_column_name)

If any combined minimum is > number_to_select we have a problem. If any combined maximum is < number_to_select we have a problem.

Source code in src/sortition_algorithms/features.py
def report_min_max_against_number_to_select(
    fc: FeatureCollection, number_to_select: int, feature_column_name: str
) -> list[str]:
    """
    If any combined minimum is > number_to_select we have a problem.
    If any combined maximum is < number_to_select we have a problem.
    """
    if not fc:
        return []
    errors: list[str] = []
    for key, fv in fc.items():
        feature_minimum = _fv_minimum_selection(fv)
        feature_maximum = _fv_maximum_selection(fv)
        if feature_minimum > number_to_select:
            errors.append(
                f"Minimum for {feature_column_name} {key} ({feature_minimum}) "
                f"is more than number to select ({number_to_select})"
            )
        if feature_maximum < number_to_select:
            errors.append(
                f"Maximum for {feature_column_name} {key} ({feature_maximum}) "
                f"is less than number to select ({number_to_select})"
            )
    return errors

report_min_max_error_details(fc, feature_column_name='feature')

Return a list of problems in detail, so that the user can debug the errors in detail

Source code in src/sortition_algorithms/features.py
def report_min_max_error_details(fc: FeatureCollection, feature_column_name: str = "feature") -> list[str]:
    """
    Return a list of problems in detail, so that the user can debug the errors in detail
    """
    if not fc:
        return []

    max_feature, max_val = min(((key, _fv_maximum_selection(fv)) for key, fv in fc.items()), key=lambda x: x[1])
    min_feature, min_val = max(((key, _fv_minimum_selection(fv)) for key, fv in fc.items()), key=lambda x: x[1])
    return [
        f"Inconsistent numbers in min and max in the {feature_column_name} input:",
        f"The smallest maximum is {max_val} for {feature_column_name} '{max_feature}'",
        f"The largest minimum is {min_val} for {feature_column_name} '{min_feature}'",
        f"You need to reduce the minimums for {min_feature} or increase the maximums for {max_feature}.",
    ]

set_default_max_flex(fc)

Note this only sets it if left at the default value

Source code in src/sortition_algorithms/features.py
def set_default_max_flex(fc: FeatureCollection) -> None:
    """Note this only sets it if left at the default value"""
    max_flex = _safe_max_flex_val(fc)
    for feature_values in fc.values():
        for fv_minmax in feature_values.values():
            fv_minmax.set_default_max_flex(max_flex)

MaxRatioResult

Result from finding the category with maximum selection ratio.

Source code in src/sortition_algorithms/people_features.py
@define(kw_only=True, slots=True)
class MaxRatioResult:
    """Result from finding the category with maximum selection ratio."""

    feature_name: str
    feature_value: str
    random_person_index: int

PeopleFeatures

This class manipulates people and features together, making a deepcopy on init.

It is only used by the legacy method.

Source code in src/sortition_algorithms/people_features.py
class PeopleFeatures:
    """
    This class manipulates people and features together, making a deepcopy on init.

    It is only used by the legacy method.
    """

    # TODO: consider naming: maybe SelectionState
    # TODO: consider moving into committee_generation/legacy.py

    def __init__(
        self,
        people: People,
        features: FeatureCollection,
        check_same_address_columns: list[str] | None = None,
    ) -> None:
        self.people = deepcopy(people)
        self.features = features
        self.select_collection = select_from_feature_collection(self.features)
        self.check_same_address_columns = check_same_address_columns or []

    def update_features_remaining(self, person_key: str) -> None:
        # this will blow up if the person does not exist
        person = self.people.get_person_dict(person_key)
        for feature_name in self.features:
            feature_value = person[feature_name]
            self.select_collection[feature_name][feature_value].add_remaining()

    def update_all_features_remaining(self) -> None:
        for person_key in self.people:
            self.update_features_remaining(person_key)

    def delete_all_with_feature_value(self, feature_name: str, feature_value: str) -> tuple[int, int]:
        """
        When a feature/value is "full" we delete everyone else in it.
        "Full" means that the number selected equals the "max" amount - that
        is detected elsewhere and then this method is called.
        Returns count of those deleted, and count of those left
        """
        # when a category is full we want to delete everyone in it
        people_to_delete: list[str] = []
        for pkey, person in self.people.items():
            if person[feature_name].lower() == feature_value.lower():
                people_to_delete.append(pkey)
                for feature in self.features:
                    current_feature_value = person[feature]
                    try:
                        self.select_collection[feature][current_feature_value].remove_remaining()
                    except errors.SelectionError as e:
                        msg = (
                            f"SELECTION IMPOSSIBLE: FAIL in delete_all_in_feature_value() "
                            f"as after previous deletion no one/not enough left in {feature} "
                            f"{person[feature]}. Tried to delete: {len(people_to_delete)}"
                        )
                        raise errors.RetryableSelectionError(msg) from e

        self.people.remove_many(people_to_delete)
        # return the number of people deleted and the number of people left
        return len(people_to_delete), self.people.count

    def prune_for_feature_max_0(self) -> list[str]:
        """
        Check if any feature_value.max is set to zero. if so delete everyone with that feature value
        NOT DONE: could then check if anyone is left.
        """
        msg: list[str] = []
        msg.append(f"Number of people: {self.people.count}.")
        total_num_deleted = 0
        for feature_name, fvalue_name, fv_minmax in iterate_feature_collection(self.features):
            if fv_minmax.max == 0:  # we don't want any of these people
                # pass the message in as deleting them might throw an exception
                msg.append(f"Feature/value {feature_name}/{fvalue_name} full - deleting people...")
                num_deleted, num_left = self.delete_all_with_feature_value(feature_name, fvalue_name)
                # if no exception was thrown above add this bit to the end of the previous message
                msg[-1] += f" Deleted {num_deleted}, {num_left} left."
                total_num_deleted += num_deleted
        # if the total number of people deleted is lots then we're probably doing a replacement selection, which means
        # the 'remaining' file will be useless - remind the user of this!
        if total_num_deleted >= self.people.count / 2:
            msg.append(
                ">>> WARNING <<< That deleted MANY PEOPLE - are you doing a "
                "replacement? If so your REMAINING FILE WILL BE USELESS!!!"
            )
        return msg

    def select_person(self, person_key: str) -> list[str]:
        """
        Selecting a person means:
        - remove the person from our copy of People
        - update the `selected` and `remaining` counts of the FeatureCollection
        - if check_same_address_columns has columns, also remove household members (without adding to selected)

        Returns:
            List of additional people removed due to same address (empty if check_same_address_columns is empty)
        """
        # First, find household members if address checking is enabled (before removing the person)
        household_members_removed = []
        if self.check_same_address_columns:
            household_members_removed = list(self.people.matching_address(person_key, self.check_same_address_columns))

        # Handle the main person selection
        person = self.people.get_person_dict(person_key)
        for feature_name in self.features:
            feature_value = person[feature_name]
            self.select_collection[feature_name][feature_value].remove_remaining()
            self.select_collection[feature_name][feature_value].add_selected()
        self.people.remove(person_key)

        # Then remove household members if any were found
        for household_member_key in household_members_removed:
            household_member = self.people.get_person_dict(household_member_key)
            for feature_name in self.features:
                feature_value = household_member[feature_name]
                self.select_collection[feature_name][feature_value].remove_remaining()
                # Note: we don't call add_selected() for household members
            self.people.remove(household_member_key)

        return household_members_removed

    def find_max_ratio_category(self) -> MaxRatioResult:
        """
        Find the feature/value combination with the highest selection ratio.

        The ratio is calculated as: (min - selected) / remaining
        This represents how urgently we need people from this category.
        Higher ratio = more urgent need (fewer people available relative to what we still need).

        Returns:
            MaxRatioResult containing the feature name, value, and a random person index

        Raises:
            SelectionError: If insufficient people remain to meet minimum requirements
        """
        max_ratio = -100.0
        result_feature_name = ""
        result_feature_value = ""
        random_person_index = -1

        for feature_name, fvalue_name, select_counts in iterate_select_collection(self.select_collection):
            # Check if we have insufficient people to meet minimum requirements
            if not select_counts.sufficient_people():
                msg = (
                    f"SELECTION IMPOSSIBLE: Not enough people remaining in {feature_name}/{fvalue_name}. "
                    f"Need {select_counts.people_still_needed} more, but only {select_counts.remaining} remaining."
                )
                raise errors.SelectionError(msg)

            # Skip categories with no remaining people or max = 0
            if select_counts.remaining == 0 or select_counts.min_max.max == 0:
                continue

            # Calculate the priority ratio
            ratio = select_counts.people_still_needed / float(select_counts.remaining)

            # Track the highest ratio category
            if ratio > max_ratio:
                max_ratio = ratio
                result_feature_name = feature_name
                result_feature_value = fvalue_name
                # from 1 to remaining
                random_person_index = random_provider().randbelow(select_counts.remaining) + 1

        # If no valid category found, all categories must be at their max or have max=0
        if not result_feature_name:
            msg = "No valid categories found - all may be at maximum or have max=0"
            raise errors.SelectionError(msg)

        return MaxRatioResult(
            feature_name=result_feature_name,
            feature_value=result_feature_value,
            random_person_index=random_person_index,
        )

    def handle_category_full_deletions(self, selected_person_data: MutableMapping[str, str]) -> RunReport:
        """
        Check if any categories are now full after a selection and delete remaining people.

        When a person is selected, some categories may reach their maximum quota.
        This method identifies such categories and removes all remaining people from them.

        Args:
            selected_person_data: Dictionary of the selected person's feature values

        Returns:
            RunReport containing messages about categories that became full and people deleted

        Raises:
            SelectionError: If deletions would violate minimum constraints
        """
        report = RunReport()

        for feature_name, fvalue_name, select_counts in iterate_select_collection(self.select_collection):
            if (
                fvalue_name.lower() == selected_person_data[feature_name].lower()
                and select_counts.selected == select_counts.min_max.max
            ):
                num_deleted, num_left = self.delete_all_with_feature_value(feature_name, fvalue_name)
                if num_deleted > 0:
                    report.add_line(
                        f"Category {feature_name}/{fvalue_name} full - deleted {num_deleted} people, {num_left} left."
                    )

        return report

delete_all_with_feature_value(feature_name, feature_value)

When a feature/value is "full" we delete everyone else in it. "Full" means that the number selected equals the "max" amount - that is detected elsewhere, and then this method is called. Returns the count of those deleted and the count of those left.
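
A minimal usage sketch (assuming an instance of the owning class named people_features; the feature name and value are hypothetical):

num_deleted, num_left = people_features.delete_all_with_feature_value("gender", "male")
print(f"Deleted {num_deleted}, {num_left} left.")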

Source code in src/sortition_algorithms/people_features.py
def delete_all_with_feature_value(self, feature_name: str, feature_value: str) -> tuple[int, int]:
    """
    When a feature/value is "full" we delete everyone else in it.
    "Full" means that the number selected equals the "max" amount - that
    is detected elsewhere and then this method is called.
    Returns count of those deleted, and count of those left
    """
    # when a category is full we want to delete everyone in it
    people_to_delete: list[str] = []
    for pkey, person in self.people.items():
        if person[feature_name].lower() == feature_value.lower():
            people_to_delete.append(pkey)
            for feature in self.features:
                current_feature_value = person[feature]
                try:
                    self.select_collection[feature][current_feature_value].remove_remaining()
                except errors.SelectionError as e:
                    msg = (
                        f"SELECTION IMPOSSIBLE: FAIL in delete_all_in_feature_value() "
                        f"as after previous deletion no one/not enough left in {feature} "
                        f"{person[feature]}. Tried to delete: {len(people_to_delete)}"
                    )
                    raise errors.RetryableSelectionError(msg) from e

    self.people.remove_many(people_to_delete)
    # return the number of people deleted and the number of people left
    return len(people_to_delete), self.people.count

find_max_ratio_category()

Find the feature/value combination with the highest selection ratio.

The ratio is calculated as (min - selected) / remaining. This represents how urgently we need people from this category: a higher ratio means a more urgent need (fewer people available relative to what we still need).

Returns:

    MaxRatioResult - containing the feature name, value, and a random person index

Raises:

    SelectionError - if insufficient people remain to meet minimum requirements
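
A worked example of the ratio, using hypothetical counts for one feature/value:

minimum, selected, remaining = 8, 3, 10
people_still_needed = max(minimum - selected, 0)   # 5
ratio = people_still_needed / float(remaining)     # 0.5 - higher means more urgent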

Source code in src/sortition_algorithms/people_features.py
def find_max_ratio_category(self) -> MaxRatioResult:
    """
    Find the feature/value combination with the highest selection ratio.

    The ratio is calculated as: (min - selected) / remaining
    This represents how urgently we need people from this category.
    Higher ratio = more urgent need (fewer people available relative to what we still need).

    Returns:
        MaxRatioResult containing the feature name, value, and a random person index

    Raises:
        SelectionError: If insufficient people remain to meet minimum requirements
    """
    max_ratio = -100.0
    result_feature_name = ""
    result_feature_value = ""
    random_person_index = -1

    for feature_name, fvalue_name, select_counts in iterate_select_collection(self.select_collection):
        # Check if we have insufficient people to meet minimum requirements
        if not select_counts.sufficient_people():
            msg = (
                f"SELECTION IMPOSSIBLE: Not enough people remaining in {feature_name}/{fvalue_name}. "
                f"Need {select_counts.people_still_needed} more, but only {select_counts.remaining} remaining."
            )
            raise errors.SelectionError(msg)

        # Skip categories with no remaining people or max = 0
        if select_counts.remaining == 0 or select_counts.min_max.max == 0:
            continue

        # Calculate the priority ratio
        ratio = select_counts.people_still_needed / float(select_counts.remaining)

        # Track the highest ratio category
        if ratio > max_ratio:
            max_ratio = ratio
            result_feature_name = feature_name
            result_feature_value = fvalue_name
            # from 1 to remaining
            random_person_index = random_provider().randbelow(select_counts.remaining) + 1

    # If no valid category found, all categories must be at their max or have max=0
    if not result_feature_name:
        msg = "No valid categories found - all may be at maximum or have max=0"
        raise errors.SelectionError(msg)

    return MaxRatioResult(
        feature_name=result_feature_name,
        feature_value=result_feature_value,
        random_person_index=random_person_index,
    )

handle_category_full_deletions(selected_person_data)

Check if any categories are now full after a selection and delete remaining people.

When a person is selected, some categories may reach their maximum quota. This method identifies such categories and removes all remaining people from them.

Parameters:

    selected_person_data (MutableMapping[str, str], required) - dictionary of the selected person's feature values

Returns:

    RunReport - containing messages about categories that became full and people deleted

Raises:

    SelectionError - if deletions would violate minimum constraints
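
A sketch of how these pieces might fit together in a single selection step (the instance name people_features and the overall flow are illustrative, not taken from the library):

result = people_features.find_max_ratio_category()
person_key = people_features.people.find_person_by_position_in_category(
    result.feature_name, result.feature_value, result.random_person_index
)
# grab the person's data before select_person() removes them
person_data = people_features.people.get_person_dict(person_key)
people_features.select_person(person_key)
report = people_features.handle_category_full_deletions(person_data)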

Source code in src/sortition_algorithms/people_features.py
def handle_category_full_deletions(self, selected_person_data: MutableMapping[str, str]) -> RunReport:
    """
    Check if any categories are now full after a selection and delete remaining people.

    When a person is selected, some categories may reach their maximum quota.
    This method identifies such categories and removes all remaining people from them.

    Args:
        selected_person_data: Dictionary of the selected person's feature values

    Returns:
        RunReport containing messages about categories that became full and people deleted

    Raises:
        SelectionError: If deletions would violate minimum constraints
    """
    report = RunReport()

    for feature_name, fvalue_name, select_counts in iterate_select_collection(self.select_collection):
        if (
            fvalue_name.lower() == selected_person_data[feature_name].lower()
            and select_counts.selected == select_counts.min_max.max
        ):
            num_deleted, num_left = self.delete_all_with_feature_value(feature_name, fvalue_name)
            if num_deleted > 0:
                report.add_line(
                    f"Category {feature_name}/{fvalue_name} full - deleted {num_deleted} people, {num_left} left."
                )

    return report

prune_for_feature_max_0()

Check if any feature_value.max is set to zero; if so, delete everyone with that feature value. NOT DONE: could then check if anyone is left.

Source code in src/sortition_algorithms/people_features.py
def prune_for_feature_max_0(self) -> list[str]:
    """
    Check if any feature_value.max is set to zero; if so, delete everyone with that feature value.
    NOT DONE: could then check if anyone is left.
    """
    msg: list[str] = []
    msg.append(f"Number of people: {self.people.count}.")
    total_num_deleted = 0
    for feature_name, fvalue_name, fv_minmax in iterate_feature_collection(self.features):
        if fv_minmax.max == 0:  # we don't want any of these people
            # pass the message in as deleting them might throw an exception
            msg.append(f"Feature/value {feature_name}/{fvalue_name} full - deleting people...")
            num_deleted, num_left = self.delete_all_with_feature_value(feature_name, fvalue_name)
            # if no exception was thrown above add this bit to the end of the previous message
            msg[-1] += f" Deleted {num_deleted}, {num_left} left."
            total_num_deleted += num_deleted
    # if the total number of people deleted is large then we're probably doing a replacement selection, which means
    # the 'remaining' file will be useless - remind the user of this!
    if total_num_deleted >= self.people.count / 2:
        msg.append(
            ">>> WARNING <<< That deleted MANY PEOPLE - are you doing a "
            "replacement? If so your REMAINING FILE WILL BE USELESS!!!"
        )
    return msg

select_person(person_key)

Selecting a person means:

- remove the person from our copy of People
- update the selected and remaining counts of the FeatureCollection
- if check_same_address_columns has columns, also remove household members (without adding to selected)

Returns:

    list[str] - list of additional people removed due to same address (empty if check_same_address_columns is empty)
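
For example (the instance name and person key are hypothetical):

removed = people_features.select_person("person_42")
if removed:
    print(f"Also removed {len(removed)} household member(s): {removed}")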

Source code in src/sortition_algorithms/people_features.py
def select_person(self, person_key: str) -> list[str]:
    """
    Selecting a person means:
    - remove the person from our copy of People
    - update the `selected` and `remaining` counts of the FeatureCollection
    - if check_same_address_columns has columns, also remove household members (without adding to selected)

    Returns:
        List of additional people removed due to same address (empty if check_same_address_columns is empty)
    """
    # First, find household members if address checking is enabled (before removing the person)
    household_members_removed = []
    if self.check_same_address_columns:
        household_members_removed = list(self.people.matching_address(person_key, self.check_same_address_columns))

    # Handle the main person selection
    person = self.people.get_person_dict(person_key)
    for feature_name in self.features:
        feature_value = person[feature_name]
        self.select_collection[feature_name][feature_value].remove_remaining()
        self.select_collection[feature_name][feature_value].add_selected()
    self.people.remove(person_key)

    # Then remove household members if any were found
    for household_member_key in household_members_removed:
        household_member = self.people.get_person_dict(household_member_key)
        for feature_name in self.features:
            feature_value = household_member[feature_name]
            self.select_collection[feature_name][feature_value].remove_remaining()
            # Note: we don't call add_selected() for household members
        self.people.remove(household_member_key)

    return household_members_removed

SelectCounts

Source code in src/sortition_algorithms/people_features.py
@define(kw_only=True, slots=True)
class SelectCounts:
    min_max: FeatureValueMinMax
    selected: int = 0
    remaining: int = 0

    def add_remaining(self) -> None:
        self.remaining += 1

    def add_selected(self) -> None:
        self.selected += 1

    def remove_remaining(self) -> None:
        self.remaining -= 1
        if self.remaining == 0 and self.selected < self.min_max.min:
            msg = "SELECTION IMPOSSIBLE: FAIL - no one/not enough left after deletion."
            raise errors.RetryableSelectionError(msg)

    @property
    def hit_target(self) -> bool:
        """Return true if selected is between min and max (inclusive)"""
        return self.selected >= self.min_max.min and self.selected <= self.min_max.max

    def percent_selected(self, number_people_wanted: int) -> float:
        return self.selected * 100 / float(number_people_wanted)

    @property
    def people_still_needed(self) -> int:
        """The number of extra people to select to get to the minimum - it should never be negative"""
        return max(self.min_max.min - self.selected, 0)

    def sufficient_people(self) -> bool:
        """
        Return true if we can still make the minimum. So either:
        - we have already selected at least the minimum, or
        - the remaining number is at least as big as the number still required
        """
        return self.selected >= self.min_max.min or self.remaining >= self.people_still_needed

hit_target property

Return true if selected is between min and max (inclusive)

people_still_needed property

The number of extra people to select to get to the minimum - it should never be negative

sufficient_people()

Return true if we can still make the minimum. So either:

- we have already selected at least the minimum, or
- the remaining number is at least as big as the number still required
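
A small numeric illustration of the check (values are hypothetical):

minimum, selected, remaining = 5, 3, 2
still_needed = max(minimum - selected, 0)               # 2
ok = selected >= minimum or remaining >= still_needed   # True - we can just make it
# with remaining = 1 the same check gives False: the minimum is now unreachable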

Source code in src/sortition_algorithms/people_features.py
def sufficient_people(self) -> bool:
    """
    Return true if we can still make the minimum. So either:
    - we have already selected at least the minimum, or
    - the remaining number is at least as big as the number still required
    """
    return self.selected >= self.min_max.min or self.remaining >= self.people_still_needed

WeightedSample

Source code in src/sortition_algorithms/people_features.py
class WeightedSample:
    def __init__(self, features: FeatureCollection) -> None:
        """
        This produces a set of lists of feature values for each feature.  Each value
        is in the list `fv_minmax.max` times - so random choices are weighted by the max.

        So if we had feature "ethnicity", value "white" with max 4, "asian" with max 3 and
        "black" with max 2 we'd get:

        ["white", "white", "white", "white", "asian", "asian", "asian", "black", "black"]

        Then making random choices from that list produces a weighted sample.
        """
        self.weighted: dict[str, list[str]] = defaultdict(list)
        for feature_name, fvalue_name, fv_minmax in iterate_feature_collection(features):
            self.weighted[feature_name] += [fvalue_name] * fv_minmax.max

    def value_for(self, feature_name: str) -> str:
        # S311 is random numbers for crypto - but this is just for a sample file
        return random_provider().choice(self.weighted[feature_name])

__init__(features)

This produces a set of lists of feature values for each feature. Each value is in the list fv_minmax.max times - so random choices are weighted by the max.

So if we had feature "ethnicity", value "white" with max 4, "asian" with max 3 and "black" with max 2 we'd get:

["white", "white", "white", "white", "asian", "asian", "asian", "black", "black"]

Then making random choices from that list produces a weighted sample.
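
The example above as a runnable sketch, using the standard library's random.choice rather than the library's random_provider():

import random

weighted = ["white"] * 4 + ["asian"] * 3 + ["black"] * 2
# each draw picks "white" with probability 4/9, "asian" 3/9 and "black" 2/9
value = random.choice(weighted)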

Source code in src/sortition_algorithms/people_features.py
def __init__(self, features: FeatureCollection) -> None:
    """
    This produces a set of lists of feature values for each feature.  Each value
    is in the list `fv_minmax.max` times - so random choices are weighted by the max.

    So if we had feature "ethnicity", value "white" with max 4, "asian" with max 3 and
    "black" with max 2 we'd get:

    ["white", "white", "white", "white", "asian", "asian", "asian", "black", "black"]

    Then making random choices from that list produces a weighted sample.
    """
    self.weighted: dict[str, list[str]] = defaultdict(list)
    for feature_name, fvalue_name, fv_minmax in iterate_feature_collection(features):
        self.weighted[feature_name] += [fvalue_name] * fv_minmax.max

iterate_select_collection(select_collection)

Helper function to iterate over select_collection.

Source code in src/sortition_algorithms/people_features.py
def iterate_select_collection(select_collection: SelectCollection) -> Generator[tuple[str, str, SelectCounts]]:
    """Helper function to iterate over select_collection."""
    for feature_name, feature_values in select_collection.items():
        for value_name, fv_counts in feature_values.items():
            yield feature_name, value_name, fv_counts

simple_add_selected(person_keys, people, features)

Just add the person to the selected counts for the feature values for that person. Don't do the more complex handling of the full PeopleFeatures.add_selected()

Source code in src/sortition_algorithms/people_features.py
def simple_add_selected(person_keys: Iterable[str], people: People, features: SelectCollection) -> None:
    """
    Just add the person to the selected counts for the feature values for that person.
    Don't do the more complex handling of the full PeopleFeatures.add_selected()
    """
    for person_key in person_keys:
        person = people.get_person_dict(person_key)
        for feature_name in features:
            feature_value = person[feature_name]
            features[feature_name][feature_value].add_selected()

People

Source code in src/sortition_algorithms/people.py
class People:
    def __init__(self, columns_to_keep: list[str]) -> None:
        self._columns_to_keep = columns_to_keep
        self._full_data: dict[str, MutableMapping[str, str]] = {}

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, self.__class__):
            return False
        return self._full_data == other._full_data and self._columns_to_keep == other._columns_to_keep

    @property
    def count(self) -> int:
        return len(self._full_data)

    def __iter__(self) -> Iterator[str]:
        return iter(self._full_data)

    def items(self) -> ItemsView[str, MutableMapping[str, str]]:
        return self._full_data.items()

    def add(
        self,
        person_key: str,
        data: MutableMapping[str, str],
        features: FeatureCollection,
        row_number: int,
        feature_column_name: str = "feature",
    ) -> None:
        person_full_data: MutableMapping[str, str] = CaseInsensitiveDict()
        errors = ParseErrorsCollector()
        # get the feature values: these are the most important and we must check them
        for feature_name, feature_values in features.items():
            # check for input errors here - if it's not in the list of feature values...
            # allow for some unclean data - at least strip empty space, but only if a str!
            # (some values can be numbers)
            p_value = data[feature_name]
            if p_value in feature_values:
                person_full_data[feature_name] = p_value
            else:
                if p_value:
                    msg = f"Value '{p_value}' not in {feature_column_name} {feature_name}"
                    error_code = "value_not_in_feature"
                    error_params = {
                        "value": p_value,
                        "feature_column_name": feature_column_name,
                        "feature_name": feature_name,
                    }
                else:
                    msg = f"Empty value in {feature_column_name} {feature_name}"
                    error_code = "empty_value_in_feature"
                    error_params = {"feature_column_name": feature_column_name, "feature_name": feature_name}
                errors.add(
                    msg=msg,
                    key=feature_name,
                    value=p_value,
                    row=row_number,
                    row_name=person_key,
                    error_code=error_code,
                    error_params=error_params,
                )
        if errors:
            raise errors.to_error()
        # then get the other column values we need
        # this is address, name etc that we need to keep for output file
        # we don't check anything here - it's just for user convenience
        for col in self._columns_to_keep:
            person_full_data[col] = data[col]

        # add all the data to our people object
        self._full_data[person_key] = person_full_data

    def remove(self, person_key: str) -> None:
        del self._full_data[person_key]

    def remove_many(self, person_keys: Iterable[str]) -> None:
        for key in person_keys:
            self.remove(key)

    def get_person_dict(self, person_key: str) -> MutableMapping[str, str]:
        return self._full_data[person_key]

    @staticmethod
    def _address_tuple(person_dict: MutableMapping[str, str], address_columns: Iterable[str]) -> tuple[str, ...]:
        return tuple(person_dict[col].lower() for col in address_columns)

    def get_address(self, person_key: str, address_columns: Iterable[str]) -> tuple[str, ...]:
        return self._address_tuple(self._full_data[person_key], address_columns)

    def households(self, address_columns: list[str]) -> dict[tuple[str, ...], list[str]]:
        """
        Generates a dict with:
        - keys: a tuple containing the address strings
        - values: a list of person_key for each person at that address
        """
        households = defaultdict(list)
        for person_key, person in self._full_data.items():
            households[self._address_tuple(person, address_columns)].append(person_key)
        return households

    def matching_address(self, person_key: str, address_columns: list[str]) -> Iterable[str]:
        """
        Returns a list of person keys for all people who have an address matching
        the address of the person passed in.
        """
        person = self._full_data[person_key]
        person_address = self._address_tuple(person, address_columns)
        for loop_key, loop_person in self._full_data.items():
            if loop_key == person_key:
                continue  # skip the person we've been given
            if person_address == self._address_tuple(loop_person, address_columns):
                yield loop_key

    def _iter_matching(self, feature_name: str, feature_value: str) -> Generator[str]:
        for person_key, person_dict in self._full_data.items():
            if person_dict[feature_name].lower() == feature_value.lower():
                yield person_key

    def count_feature_value(self, feature_name: str, feature_value: str) -> int:
        return len(list(self._iter_matching(feature_name, feature_value)))

    def find_person_by_position_in_category(self, feature_name: str, feature_value: str, position: int) -> str:
        """
        Find the nth person (1-indexed) in a specific feature category.

        Args:
            feature_name: Name of the feature (e.g., "gender")
            feature_value: Value of the feature (e.g., "male")
            position: 1-indexed position within the category

        Returns:
            Person key of the person at the specified position

        Raises:
            SelectionError: If no person is found at the specified position
        """
        people_in_category = list(self._iter_matching(feature_name, feature_value))
        try:
            return people_in_category[position - 1]
        except IndexError:
            # Should always find someone if position is valid
            # If we hit this line it is a bug
            msg = f"Failed to find person at position {position} in {feature_name}/{feature_value}"
            raise SelectionError(
                message=msg,
                error_code="person_not_found",
                error_params={"position": position, "feature_name": feature_name, "feature_value": feature_value},
            ) from None

find_person_by_position_in_category(feature_name, feature_value, position)

Find the nth person (1-indexed) in a specific feature category.

Parameters:

    feature_name (str, required) - name of the feature (e.g., "gender")
    feature_value (str, required) - value of the feature (e.g., "male")
    position (int, required) - 1-indexed position within the category

Returns:

    str - person key of the person at the specified position

Raises:

    SelectionError - if no person is found at the specified position
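
A usage sketch, assuming a People instance named people; the position argument is 1-indexed, matching the random_person_index produced by find_max_ratio_category():

# pick the 3rd person (1-indexed) whose "gender" value is "female"
person_key = people.find_person_by_position_in_category("gender", "female", 3)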

Source code in src/sortition_algorithms/people.py
def find_person_by_position_in_category(self, feature_name: str, feature_value: str, position: int) -> str:
    """
    Find the nth person (1-indexed) in a specific feature category.

    Args:
        feature_name: Name of the feature (e.g., "gender")
        feature_value: Value of the feature (e.g., "male")
        position: 1-indexed position within the category

    Returns:
        Person key of the person at the specified position

    Raises:
        SelectionError: If no person is found at the specified position
    """
    people_in_category = list(self._iter_matching(feature_name, feature_value))
    try:
        return people_in_category[position - 1]
    except IndexError:
        # Should always find someone if position is valid
        # If we hit this line it is a bug
        msg = f"Failed to find person at position {position} in {feature_name}/{feature_value}"
        raise SelectionError(
            message=msg,
            error_code="person_not_found",
            error_params={"position": position, "feature_name": feature_name, "feature_value": feature_value},
        ) from None

households(address_columns)

Generates a dict with:

- keys: a tuple containing the address strings
- values: a list of person_key for each person at that address
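
For example, to find multi-person households (assuming a People instance named people; the column names are hypothetical):

households = people.households(["address1", "postcode"])
multi = {addr: keys for addr, keys in households.items() if len(keys) > 1}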

Source code in src/sortition_algorithms/people.py
def households(self, address_columns: list[str]) -> dict[tuple[str, ...], list[str]]:
    """
    Generates a dict with:
    - keys: a tuple containing the address strings
    - values: a list of person_key for each person at that address
    """
    households = defaultdict(list)
    for person_key, person in self._full_data.items():
        households[self._address_tuple(person, address_columns)].append(person_key)
    return households

matching_address(person_key, address_columns)

Returns a list of person keys for all people who have an address matching the address of the person passed in.

Source code in src/sortition_algorithms/people.py
def matching_address(self, person_key: str, address_columns: list[str]) -> Iterable[str]:
    """
    Returns a list of person keys for all people who have an address matching
    the address of the person passed in.
    """
    person = self._full_data[person_key]
    person_address = self._address_tuple(person, address_columns)
    for loop_key, loop_person in self._full_data.items():
        if loop_key == person_key:
            continue  # skip the person we've been given
        if person_address == self._address_tuple(loop_person, address_columns):
            yield loop_key

check_enough_people_for_every_feature_value(features, people)

For each feature/value, if the min>0, check there are enough people who have that feature/value

Source code in src/sortition_algorithms/people.py
def check_enough_people_for_every_feature_value(features: FeatureCollection, people: People) -> None:
    """For each feature/value, if the min>0, check there are enough people who have that feature/value"""
    error_list: list[str] = []
    for fname, fvalue, fv_minmax in iterate_feature_collection(features):
        matching_count = people.count_feature_value(fname, fvalue)
        if matching_count < fv_minmax.min:
            error_list.append(
                f"Not enough people with the value '{fvalue}' in category '{fname}' - "
                f"the minimum is {fv_minmax.min} but we only have {matching_count}"
            )
    if error_list:
        raise SelectionMultilineError(error_list)

check_for_duplicate_people(people_body, settings)

If we have rows with duplicate IDs, things are going to go bad. First check for any duplicate IDs. If we find any, check if the duplicates are identical.

Returns:

    RunReport - containing warnings about duplicate people

Raises:

    SelectionMultilineError - if duplicate IDs have different data
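
The first step is a straightforward Counter pass; a standalone sketch of the same idea (the rows are made up):

from collections import Counter

rows = [{"id": "1", "name": "a"}, {"id": "2", "name": "b"}, {"id": "1", "name": "a"}]
id_counter = Counter(row["id"] for row in rows)
duplicate_ids = {k for k, v in id_counter.items() if v > 1}   # {"1"}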

Source code in src/sortition_algorithms/people.py
def check_for_duplicate_people(people_body: Iterable[MutableMapping[str, str]], settings: Settings) -> RunReport:
    """
    If we have rows with duplicate IDs things are going to go bad.
    First check for any duplicate IDs. If we find any, check if the duplicates are identical.

    Returns:
        RunReport containing warnings about duplicate people

    Raises:
        SelectionMultilineError: If duplicate IDs have different data
    """
    report = RunReport()

    # first check for any duplicate_ids
    id_counter = Counter(row[settings.id_column] for row in people_body)
    duplicate_ids = {k for k, v in id_counter.items() if v > 1}
    if not duplicate_ids:
        return report

    # find the duplicate rows
    duplicate_rows: dict[str, list[MutableMapping[str, str]]] = defaultdict(list)
    for row in people_body:
        pkey = row[settings.id_column]
        if pkey in duplicate_ids:
            duplicate_rows[pkey].append(row)

    report.add_message("duplicate_ids_found", count=len(duplicate_rows))
    report.add_message("duplicate_ids_list", ids=" ".join(duplicate_rows))

    # find rows where everything is not equal
    duplicate_differing_rows: dict[str, list[MutableMapping[str, str]]] = {}
    for key, value in duplicate_rows.items():
        if not _all_in_list_equal(value):
            duplicate_differing_rows[key] = value
    if not duplicate_differing_rows:
        report.add_message("duplicate_rows_identical")
        return report

    # Build error message with full context
    output: list[str] = []
    output.append(f"Found {len(duplicate_rows)} IDs that have more than one row")
    output.append(f"Duplicated IDs are: {' '.join(duplicate_rows)}")
    output.append(f"Found {len(duplicate_differing_rows)} IDs that have more than one row with different data")
    for key, value in duplicate_differing_rows.items():
        for row in value:
            output.append(f"For ID '{key}' one row of data is: {row}")
    raise SelectionMultilineError(output)

exclude_matching_selected_addresses(people, already_selected, settings)

If we are checking the same addresses, then we should start by excluding people who have the same address as someone who is already selected.
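
Typical use before starting a selection run (a sketch, assuming people and already_selected are People instances loaded elsewhere):

people = exclude_matching_selected_addresses(people, already_selected, settings)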

Source code in src/sortition_algorithms/people.py
def exclude_matching_selected_addresses(people: People, already_selected: People | None, settings: Settings) -> People:
    """
    If we are checking the same addresses, then we should start by excluding people
    who have the same address as someone who is already selected.
    """
    if already_selected is None or not already_selected.count or not settings.check_same_address:
        return people
    selected_addresses = {
        already_selected.get_address(pkey, settings.check_same_address_columns) for pkey in already_selected
    }
    new_people = deepcopy(people)
    for person_key in people:
        if people.get_address(person_key, settings.check_same_address_columns) in selected_addresses:
            new_people.remove(person_key)
    return new_people

Settings

Settings to use when selecting committees. Note that only the first two are required. A minimal example would be:

Settings(id_column="my_id", columns_to_keep=["name", "email"])
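
A fuller example that also enables address checking (the field values are illustrative):

Settings(
    id_column="my_id",
    columns_to_keep=["name", "email"],
    check_same_address=True,
    check_same_address_columns=["address1", "postcode"],
)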

Source code in src/sortition_algorithms/settings.py
@define
class Settings:
    """
    Settings to use when selecting committees. Note that only the first two are required.
    A minimal example would be:

    Settings(id_column="my_id", columns_to_keep=["name", "email"])
    """

    # required
    id_column: str = field(validator=validators.instance_of(str))
    columns_to_keep: list[str] = field()

    # fields with defaults
    check_same_address: bool = field(validator=validators.instance_of(bool), default=False)
    check_same_address_columns: list[str] = field(validator=check_columns_for_same_address, factory=list)
    max_attempts: int = field(validator=validators.instance_of(int), default=100)
    selection_algorithm: str = field(default="maximin")
    random_number_seed: int = field(validator=validators.instance_of(int), default=0)

    @columns_to_keep.validator
    def check_columns_to_keep(self, attribute: Any, value: Any) -> None:
        if not isinstance(value, list):
            raise TypeError("columns_to_keep must be a LIST of strings", "columns_to_keep_not_list", {})
        if not all(isinstance(element, str) for element in value):
            raise TypeError("columns_to_keep must be a list of STRINGS", "columns_to_keep_not_strings", {})

    @selection_algorithm.validator
    def check_selection_algorithm(self, attribute: Any, value: str) -> None:
        if value not in SELECTION_ALGORITHMS:
            algorithms_str = ", ".join(SELECTION_ALGORITHMS)
            raise ValueError(
                f"selection_algorithm {value} is not one of: {algorithms_str}",
                "invalid_selection_algorithm",
                {"algorithm": value, "valid_algorithms": algorithms_str},
            )

    @property
    def normalised_address_columns(self) -> list[str]:
        """
        Returns an empty list if address columns should not be checked (or if the columns
        specified were an empty list). Otherwise returns the columns. That way other code can
        just check if the columns are empty rather than checking the bool separately.
        """
        return self.check_same_address_columns if self.check_same_address else []

    @property
    def full_columns_to_keep(self) -> list[str]:
        """
        We always need to keep the address columns, so in case they are not listed in
        self.columns_to_keep, this property provides the combined list.
        """
        extra_address_columns = [col for col in self.check_same_address_columns if col not in self.columns_to_keep]
        return [*self.columns_to_keep, *extra_address_columns]

    def as_dict(self) -> dict[str, Any]:
        dict_repr = unstructure(self)
        assert isinstance(dict_repr, dict)
        assert all(isinstance(key, str) for key in dict_repr)
        return dict_repr

    @classmethod
    def load_from_file(
        cls,
        settings_file_path: Path,
    ) -> tuple["Settings", RunReport]:
        report = RunReport()
        if not settings_file_path.is_file():
            with open(settings_file_path, "w", encoding="utf-8") as settings_file:
                settings_file.write(DEFAULT_SETTINGS)
            report.add_message("wrote_default_settings", file_path=str(settings_file_path.absolute()))
        with open(settings_file_path, "rb") as settings_file:
            settings = tomllib.load(settings_file)
        # you can't check an address if there is no info about which columns to check...
        if settings["check_same_address"] is False:
            report.add_message("address_checking_disabled_warning", level=ReportLevel.IMPORTANT)
            settings["check_same_address_columns"] = []
        return structure(settings, cls), report

full_columns_to_keep property

We always need to keep the address columns, so in case they are not listed in self.columns_to_keep, this property provides the combined list.

normalised_address_columns property

Returns an empty list if address columns should not be checked (or if the columns specified were an empty list). Otherwise returns the columns. That way other code can just check if the columns are empty rather than checking the bool separately.

RandomProvider

Bases: ABC

This is something of a hack. Mostly we want to use the secrets module. But for repeatable testing we might want to set the random.seed sometimes.

So we have a global _random_provider which can be switched between an instance of this class that uses the secrets module and an instance that uses random with a seed. The switch is done by the set_random_provider() function.

Then every time we want some randomness, we call random_provider() to get the current version of the global.
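
A simplified sketch of the pattern (not the library's actual implementation; the names are illustrative):

import random
import secrets

_provider = secrets.SystemRandom()   # default: cryptographically strong randomness

def set_seeded_provider(seed: int) -> None:
    """Switch to repeatable randomness, e.g. for tests."""
    global _provider
    _provider = random.Random(seed)

def random_provider():
    return _provider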

Source code in src/sortition_algorithms/utils.py
class RandomProvider(ABC):
    """
    This is something of a hack. Mostly we want to use the `secrets` module.
    But for repeatable testing we might want to set the random.seed sometimes.

    So we have a global `_random_provider` which can be switched between an
    instance of this class that uses the `secrets` module and an instance that
    uses `random` with a seed. The switch is done by the `set_random_provider()`
    function.

    Then every time we want some randomness, we call `random_provider()` to get
    the current version of the global.
    """

    @classmethod
    @abstractmethod
    def uniform(cls, lower: float, upper: float) -> float: ...

    @classmethod
    @abstractmethod
    def randbelow(cls, upper: int) -> int: ...

    @classmethod
    @abstractmethod
    def choice(cls, seq: "SupportsLenAndGetItem[str]") -> str: ...

RunReport

A class to hold a report to show to the user at the end

Source code in src/sortition_algorithms/utils.py
@define(eq=True)
class RunReport:
    """A class to hold a report to show to the user at the end"""

    _data: list[RunLineLevel | RunTable | RunError] = field(factory=list)

    def __bool__(self) -> bool:
        """
        Basically, False if the report is empty, or True if there is some content. So you can do
        things like

        ```
        if run_report:
            print(f"Run Report\n\n{run_report.as_text()}")
        ```
        """
        return self.has_content()

    def serialize(self) -> dict[str, Any]:
        return _converter.unstructure(self)  # type: ignore[no-any-return]

    @classmethod
    def deserialize(cls, serialized_data: dict[str, Any]) -> "RunReport":
        return _converter.structure(serialized_data, cls)

    def has_content(self) -> bool:
        """
        False if the report is empty, or True if there is some content. So you can do
        things like

        ```
        if run_report.has_content():
            print(f"Run Report\n\n{run_report.as_text()}")
        ```
        """
        return bool(self._data)

    def add_line(
        self,
        line: str,
        level: ReportLevel = ReportLevel.NORMAL,
        message_code: str | None = None,
        message_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Add a line of text, and a level - so important/critical messages can be highlighted in the HTML report.

        Args:
            line: The English message text (for backward compatibility and standalone use)
            level: Importance level of the message
            message_code: Optional translation key for i18n (e.g., "loading_features_from_file")
            message_params: Optional parameters for message translation (e.g., {"file_path": "features.csv"})
        """
        self._data.append(RunLineLevel(line, level, message_code=message_code, message_params=message_params or {}))

    def add_line_and_log(
        self,
        line: str,
        log_level: int,
        message_code: str | None = None,
        message_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Add a line of text, and a level - so important/critical messages can be highlighted in the HTML report.

        This method will also log the message to the `user_logger`. This message can be shown to the user as
        the run is happening, so the user has feedback on what is going on while the run is in progress.

        When generating the report we can skip those messages, to avoid duplication. But if the user_logger
        has not been set up to be shown to the user during the run, we do want those messages to be in the
        final report.

        Args:
            line: The English message text (for backward compatibility and standalone use)
            log_level: Logging level for the message
            message_code: Optional translation key for i18n (e.g., "trial_number")
            message_params: Optional parameters for message translation (e.g., {"trial": 3})
        """
        self._data.append(
            RunLineLevel(
                line, ReportLevel.NORMAL, log_level, message_code=message_code, message_params=message_params or {}
            )
        )
        user_logger.log(level=log_level, msg=line)

    def add_message(self, code: str, level: ReportLevel = ReportLevel.NORMAL, **params: Any) -> None:
        """
        Add a translatable message using a message code and parameters.

        This is a convenience method that combines get_message() and add_line() in one call,
        making it simpler to add messages with translation support.

        Args:
            code: The message code from REPORT_MESSAGES (e.g., "loading_features_from_file")
            level: Importance level of the message
            **params: Parameters to substitute into the message template

        Example:
            >>> report.add_message("features_found", count=5)
            >>> report.add_message("trial_number", ReportLevel.IMPORTANT, trial=3)
        """
        message = get_message(code, **params)
        self.add_line(message, level=level, message_code=code, message_params=params)

    def add_message_and_log(self, code: str, log_level: int, **params: Any) -> None:
        """
        Add a translatable message using a message code and parameters, and log it.

        This is a convenience method that combines get_message() and add_line_and_log() in one call,
        making it simpler to add messages with translation support that are also logged.

        Args:
            code: The message code from REPORT_MESSAGES (e.g., "trial_number")
            log_level: Logging level for the message
            **params: Parameters to substitute into the message template

        Example:
            >>> report.add_message_and_log("trial_number", logging.WARNING, trial=3)
            >>> report.add_message_and_log("basic_solution_warning", logging.WARNING, algorithm="maximin", num_panels=150, num_agents=100, min_probs=0.001)
        """
        message = get_message(code, **params)
        self.add_line_and_log(message, log_level, message_code=code, message_params=params)

    def add_lines(self, lines: Iterable[str], level: ReportLevel = ReportLevel.NORMAL) -> None:
        """
        Add multiple lines of text with the same level.

        .. deprecated:: (next version)
            This method is deprecated. Functions should return RunReport instead of list[str],
            and callers should use add_report() to merge them. This provides better support
            for translation and structured reporting.
        """
        warnings.warn(
            "add_lines() is deprecated. Functions should return RunReport instead of list[str], "
            "and use add_report() to merge them.",
            DeprecationWarning,
            stacklevel=2,
        )
        for line in lines:
            self._data.append(RunLineLevel(line, level))

    def add_table(self, table_headings: list[str], table_data: list[list[str | int | float]]) -> None:
        self._data.append(RunTable(table_headings, table_data))

    def add_error(self, error: Exception, is_fatal: bool = True) -> None:
        self._data.append(RunError(error, is_fatal=is_fatal))

    def add_report(self, other: "RunReport") -> None:
        self._data += other._data

    def _element_to_text(self, element: RunLineLevel | RunTable | RunError, include_logged: bool) -> str | None:
        if isinstance(element, RunLineLevel):
            # we might want to skip lines that were already logged
            if include_logged or element.log_level == logging.NOTSET:
                return element.line
            else:
                # sometimes we want empty strings for blank lines, so here we return None
                # instead so the logged lines can be filtered out
                return None
        elif isinstance(element, RunTable):
            table_text = tabulate(element.data, headers=element.headers, tablefmt="simple")
            # we want a blank line before and after the table.
            return f"\n{table_text}\n"
        else:
            return str(element.error)

    def as_text(self, include_logged: bool = True) -> str:
        parts = [self._element_to_text(element, include_logged) for element in self._data]
        return "\n".join(p for p in parts if p is not None)

    def _line_to_html(self, line_level: RunLineLevel) -> str:
        tags = {
            ReportLevel.NORMAL: ("", ""),
            ReportLevel.IMPORTANT: ("<b>", "</b>"),
            ReportLevel.CRITICAL: ('<b style="color: red">', "</b>"),
        }
        start_tag, end_tag = tags[line_level.level]
        escaped_line = html.escape(line_level.line)
        return f"{start_tag}{escaped_line}{end_tag}"

    def _error_to_html(self, run_error: RunError) -> str:
        start_tag, end_tag = ("<b>", "</b>") if run_error.is_fatal else ("", "")
        if isinstance(run_error.error, errors.SortitionBaseError):
            return f"{start_tag}{run_error.error.to_html()}{end_tag}"
        # default to the string representation
        return f"{start_tag}{run_error.error}{end_tag}"

    def _element_to_html(self, element: RunLineLevel | RunTable | RunError, include_logged: bool) -> str | None:
        if isinstance(element, RunLineLevel):
            if include_logged or element.log_level == logging.NOTSET:
                return self._line_to_html(element)
            else:
                return None
        elif isinstance(element, RunTable):
            return tabulate(element.data, headers=element.headers, tablefmt="html")
        else:
            return self._error_to_html(element)

    def as_html(self, include_logged: bool = True) -> str:
        parts = [self._element_to_html(element, include_logged) for element in self._data]
        return "<br />\n".join(p for p in parts if p is not None)

    def last_error(self) -> Exception | None:
        for element in reversed(self._data):
            if isinstance(element, RunError):
                return element.error
        return None

__bool__()

Basically, False if the report is empty, or True if there is some content. So you can do
things like:

```
if run_report:
    print(f"Run Report\n\n{run_report.as_text()}")
```

Source code in src/sortition_algorithms/utils.py
def __bool__(self) -> bool:
    """
    Basically, False if the report is empty, or True if there is some content. So you can do
    things like

    ```
    if run_report:
        print(f"Run Report\n\n{run_report.as_text()}")
    ```
    """
    return self.has_content()

add_line(line, level=ReportLevel.NORMAL, message_code=None, message_params=None)

Add a line of text, and a level - so important/critical messages can be highlighted in the HTML report.

Parameters:

    line (str, required) - the English message text (for backward compatibility and standalone use)
    level (ReportLevel, default ReportLevel.NORMAL) - importance level of the message
    message_code (str | None, default None) - optional translation key for i18n (e.g., "loading_features_from_file")
    message_params (dict[str, Any] | None, default None) - optional parameters for message translation (e.g., {"file_path": "features.csv"})
Source code in src/sortition_algorithms/utils.py
def add_line(
    self,
    line: str,
    level: ReportLevel = ReportLevel.NORMAL,
    message_code: str | None = None,
    message_params: dict[str, Any] | None = None,
) -> None:
    """
    Add a line of text, and a level - so important/critical messages can be highlighted in the HTML report.

    Args:
        line: The English message text (for backward compatibility and standalone use)
        level: Importance level of the message
        message_code: Optional translation key for i18n (e.g., "loading_features_from_file")
        message_params: Optional parameters for message translation (e.g., {"file_path": "features.csv"})
    """
    self._data.append(RunLineLevel(line, level, message_code=message_code, message_params=message_params or {}))

add_line_and_log(line, log_level, message_code=None, message_params=None)

Add a line of text, and a level - so important/critical messages can be highlighted in the HTML report.

This method will also log the message to the user_logger. This message can be shown to the user as the run is happening, so the user has feedback on what is going on while the run is in progress.

When generating the report we can skip those messages, to avoid duplication. But if the user_logger has not been set up to be shown to the user during the run, we do want those messages to be in the final report.

Parameters:

    line (str, required) - the English message text (for backward compatibility and standalone use)
    log_level (int, required) - logging level for the message
    message_code (str | None, default None) - optional translation key for i18n (e.g., "trial_number")
    message_params (dict[str, Any] | None, default None) - optional parameters for message translation (e.g., {"trial": 3})
Source code in src/sortition_algorithms/utils.py
def add_line_and_log(
    self,
    line: str,
    log_level: int,
    message_code: str | None = None,
    message_params: dict[str, Any] | None = None,
) -> None:
    """
    Add a line of text, and a level - so important/critical messages can be highlighted in the HTML report.

    This method will also log the message to the `user_logger`. This message can be shown to the user as
    the run is happening, so the user has feedback on what is going on while the run is in progress.

    When generating the report we can skip those messages, to avoid duplication. But if the user_logger
    has not been set up to be shown to the user during the run, we do want those messages to be in the
    final report.

    Args:
        line: The English message text (for backward compatibility and standalone use)
        log_level: Logging level for the message
        message_code: Optional translation key for i18n (e.g., "trial_number")
        message_params: Optional parameters for message translation (e.g., {"trial": 3})
    """
    self._data.append(
        RunLineLevel(
            line, ReportLevel.NORMAL, log_level, message_code=message_code, message_params=message_params or {}
        )
    )
    user_logger.log(level=log_level, msg=line)

add_lines(lines, level=ReportLevel.NORMAL)

Add multiple lines of text with the same level.

Deprecated (next version): this method is deprecated. Functions should return RunReport instead of list[str], and callers should use add_report() to merge them. This provides better support for translation and structured reporting.

Source code in src/sortition_algorithms/utils.py
def add_lines(self, lines: Iterable[str], level: ReportLevel = ReportLevel.NORMAL) -> None:
    """
    Add multiple lines of text with the same level.

    .. deprecated:: (next version)
        This method is deprecated. Functions should return RunReport instead of list[str],
        and callers should use add_report() to merge them. This provides better support
        for translation and structured reporting.
    """
    warnings.warn(
        "add_lines() is deprecated. Functions should return RunReport instead of list[str], "
        "and use add_report() to merge them.",
        DeprecationWarning,
        stacklevel=2,
    )
    for line in lines:
        self._data.append(RunLineLevel(line, level))

add_message(code, level=ReportLevel.NORMAL, **params)

Add a translatable message using a message code and parameters.

This is a convenience method that combines get_message() and add_line() in one call, making it simpler to add messages with translation support.

Parameters:

    code (str, required) - the message code from REPORT_MESSAGES (e.g., "loading_features_from_file")
    level (ReportLevel, default ReportLevel.NORMAL) - importance level of the message
    **params (Any) - parameters to substitute into the message template

Example:

    report.add_message("features_found", count=5)
    report.add_message("trial_number", ReportLevel.IMPORTANT, trial=3)

Source code in src/sortition_algorithms/utils.py
def add_message(self, code: str, level: ReportLevel = ReportLevel.NORMAL, **params: Any) -> None:
    """
    Add a translatable message using a message code and parameters.

    This is a convenience method that combines get_message() and add_line() in one call,
    making it simpler to add messages with translation support.

    Args:
        code: The message code from REPORT_MESSAGES (e.g., "loading_features_from_file")
        level: Importance level of the message
        **params: Parameters to substitute into the message template

    Example:
        >>> report.add_message("features_found", count=5)
        >>> report.add_message("trial_number", ReportLevel.IMPORTANT, trial=3)
    """
    message = get_message(code, **params)
    self.add_line(message, level=level, message_code=code, message_params=params)

add_message_and_log(code, log_level, **params)

Add a translatable message using a message code and parameters, and log it.

This is a convenience method that combines get_message() and add_line_and_log() in one call, making it simpler to add messages with translation support that are also logged.

Parameters:

    code (str, required) - the message code from REPORT_MESSAGES (e.g., "trial_number")
    log_level (int, required) - logging level for the message
    **params (Any) - parameters to substitute into the message template

Example:

    report.add_message_and_log("trial_number", logging.WARNING, trial=3)
    report.add_message_and_log("basic_solution_warning", logging.WARNING, algorithm="maximin", num_panels=150, num_agents=100, min_probs=0.001)

Source code in src/sortition_algorithms/utils.py
def add_message_and_log(self, code: str, log_level: int, **params: Any) -> None:
    """
    Add a translatable message using a message code and parameters, and log it.

    This is a convenience method that combines get_message() and add_line_and_log() in one call,
    making it simpler to add messages with translation support that are also logged.

    Args:
        code: The message code from REPORT_MESSAGES (e.g., "trial_number")
        log_level: Logging level for the message
        **params: Parameters to substitute into the message template

    Example:
        >>> report.add_message_and_log("trial_number", logging.WARNING, trial=3)
        >>> report.add_message_and_log("basic_solution_warning", logging.WARNING, algorithm="maximin", num_panels=150, num_agents=100, min_probs=0.001)
    """
    message = get_message(code, **params)
    self.add_line_and_log(message, log_level, message_code=code, message_params=params)

has_content()

False if the report is empty, or True if there is some content. So you can do
things like:

```
if run_report.has_content():
    print(f"Run Report\n\n{run_report.as_text()}")
```

Source code in src/sortition_algorithms/utils.py
def has_content(self) -> bool:
    """
    False if the report is empty, or True if there is some content. So you can do
    things like

    ```
    if run_report.has_content():
        print(f"Run Report\n\n{run_report.as_text()}")
    ```
    """
    return bool(self._data)

default_logging_setup()

Set both logger and user_logger to send output to stdout

Source code in src/sortition_algorithms/utils.py
def default_logging_setup() -> tuple[logging.Logger, logging.Logger]:
    """Set both logger and user_logger to send output to stdout"""
    # we have two loggers
    # - user_logger is used for messages that any user should see
    # - logger is used for messages that only a developer or admin should need to see
    user_logger = logging.getLogger("sortition_algorithms_user")
    user_logger.setLevel(logging.INFO)
    if not user_logger.handlers:
        # no set up has been done yet - so we do it here
        user_logger.addHandler(logging.StreamHandler(sys.stdout))
    logger = logging.getLogger("sortition_algorithms")
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        # no set up has been done yet - so we do it here
        # this logger just goes straight to stdout - no timestamps or anything
        logger.addHandler(logging.StreamHandler(sys.stdout))
    return user_logger, logger

get_cell_name(row, col_name, headers)

Given the column name, get the spreadsheet cell name, e.g. "A5".
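
A couple of worked examples (the headers are hypothetical):

headers = ["id", "name", "email"]
get_cell_name(5, "email", headers)                 # "C5" - index 2 maps to column C

# columns past "Z" get two letters: index 26 maps to "AA"
get_cell_name(2, "extra", [*["x"] * 26, "extra"])  # "AA2"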

Source code in src/sortition_algorithms/utils.py
def get_cell_name(row: int, col_name: str, headers: Sequence[str]) -> str:
    """Given the column_name, get the spreadsheet cell name, eg "A5" """
    col_index = headers.index(col_name)
    if col_index > 25:
        col1 = ["", *string.ascii_uppercase][col_index // 26]
        col2 = string.ascii_uppercase[col_index % 26]
        col_name = f"{col1}{col2}"
    else:
        col_name = string.ascii_uppercase[col_index]
    return f"{col_name}{row}"

normalise_dict(original)

Returns a new dict where every value from the original has been converted to str and strip()ped of whitespace.
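
For example:

clean = normalise_dict({"Name": "  Alice ", "Age": 42})
clean["name"]   # "Alice" - the result is case-insensitive, values are str-converted and stripped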

Source code in src/sortition_algorithms/utils.py
def normalise_dict(original: Mapping[str, str] | Mapping[str, str | int]) -> MutableMapping[str, str]:
    """
    Returns a new dict where every value from the original has been converted
    to str and strip()ped of whitespace
    """
    new_dict: MutableMapping[str, str] = CaseInsensitiveDict()
    for key, original_value in original.items():
        new_dict[key] = strip_str_int(original_value)
    return new_dict

override_logging_handlers(user_logger_handlers, logger_handlers)

Replace the default handlers with other ones
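
For example, to send user-facing messages to a file while leaving the developer logger's handlers alone (the file name is illustrative):

import logging

override_logging_handlers([logging.FileHandler("run_report.log")], [])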

Source code in src/sortition_algorithms/utils.py
def override_logging_handlers(
    user_logger_handlers: list[logging.Handler], logger_handlers: list[logging.Handler]
) -> None:
    """Replace the default handlers with other ones"""
    if user_logger_handlers:
        _override_handlers_for(logging.getLogger("sortition_algorithms_user"), user_logger_handlers)
    if logger_handlers:
        _override_handlers_for(logging.getLogger("sortition_algorithms"), logger_handlers)