Data Adapters¶
Data adapters handle loading demographic data and candidate pools from various sources, and exporting selection results back to those sources. The library includes adapters for CSV files and Google Sheets, and you can write custom adapters for other data sources.
Built-in Data Sources¶
CSVFileDataSource¶
The most commonly used adapter for working with local CSV files.
Basic Usage¶
from sortition_algorithms import CSVFileDataSource, SelectionData, Settings
from pathlib import Path
data_source = CSVFileDataSource(
features_file=Path("demographics.csv"),
people_file=Path("candidates.csv"),
selected_file=Path("selected.csv"),
remaining_file=Path("remaining.csv"),
)
select_data = SelectionData(data_source)
settings = Settings()
# Load data
features, report = select_data.load_features()
people, report = select_data.load_people(settings, features)
# Do Selection
# ...
# Export results (after running selection)
data_source.output_selected_remaining(selected_rows, remaining_rows)
Working with String Data¶
For data already in memory:
# Load from string content
features_csv = """feature,value,min,max
Gender,Male,45,55
Gender,Female,45,55"""
people_csv = """id,Name,Gender
p001,Alice,Female
p002,Bob,Male"""
data_source = CSVStringDataSource(features_csv, people_csv)
select_data = SelectionData(data_source)
features, report = select_data.load_features()
people, report = select_data.load_people(Settings(), features)
Full CSV Workflow Example¶
This time for replacements, so we refer to the already selected people.
from sortition_algorithms import (
CSVFileDataSource, run_stratification, selected_remaining_tables, SelectionData, Settings
)
from pathlib import Path
import csv
def csv_selection_workflow():
# Initialize
data_source = CSVFileDataSource(
features_file=Path("demographics.csv"),
people_file=Path("candidates.csv"),
already_selected_file=Path("selected.csv"),
selected_file=Path("replacements.csv"),
remaining_file=Path("remaining.csv"),
)
select_data = SelectionData(data_source)
settings = Settings()
number_wanted=100
# Load data
features, report = select_data.load_features(number_wanted)
print(report.as_text())
people, report = select_data.load_people(settings, features)
print(report.as_text())
already_selected, report = select_data.load_already_selected(settings, features)
print(report.as_text())
# Run selection
success, panels, msgs = run_stratification(
features, people, number_wanted, settings, already_selected=already_selected
)
if success:
# Format results
selected_table, remaining_table, _ = selected_remaining_tables(
people, panels[0], features, settings, already_selected=already_selected
)
# Export results
data_source.output_selected_remaining(selected_rows, remaining_rows)
print(f"Selected {len(panels[0])} people successfully")
else:
print("Selection failed")
print("\n".join(msgs))
GSheetDataSource¶
For organizations using Google Sheets for data management.
Setup Requirements¶
- Google Cloud Project: Create a project in Google Cloud Console
- Enable APIs: Enable Google Sheets API and Google Drive API
- Service Account: Create service account credentials and download JSON key
- Share Spreadsheet: Share your spreadsheet with the service account email address
Basic Usage¶
from sortition_algorithms import GSheetDataSource, SelectionData, Settings
from pathlib import Path
# Initialize with credentials
data_source = GSheetDataSource(
feature_tab_name="Demographics",
people_tab_name="Candidates",
auth_json_path=Path("/secure/path/credentials.json"),
gen_rem_tab=True, # Generate remaining tab
)
data_source.set_g_sheet_name("My Spreadsheet")
select_data = SelectionData(data_source)
# Load data from Google Sheet
features, report = select_data.load_features()
print(report.as_text())
people, report = select_data.load_people(settings, features)
print(report.as_text())
# Here, do selection
# Configure output tabs
data_source.selected_tab_name_stub = "Selected Panel"
data_source.remaining_tab_name_stub = "Reserve Pool"
# Export results (after running selection)
select_data.output_selected_remaining(selected_rows, remaining_rows, settings)
Full Google Sheets Workflow¶
This time for replacements, so we refer to the already selected people.
from sortition_algorithms import GSheetDataSource, SelectionData, run_stratification, selected_remaining_tables, Settings
from pathlib import Path
def gsheet_selection_workflow():
# Initialize
data_source = GSheetDataSource(
feature_tab_name="Demographics",
people_tab_name="Candidates",
already_selected_tab_name="Selected",
auth_json_path=Path("/secure/path/credentials.json"),
gen_rem_tab=True, # Generate remaining tab
)
data_source.set_g_sheet_name("My Spreadsheet")
select_data = SelectionData(data_source)
settings = Settings()
number_wanted = 120
# Load data
adapter.set_g_sheet_name("Citizen Panel 2024")
features, report = adapter.load_features(number_wanted)
if features is None:
print("Failed to load features:", "\n".join(msgs))
return
people, report = adapter.load_people(settings, features)
if people is None:
print("Failed to load people:", "\n".join(msgs))
return
already_selected, report = adapter.load_already_selected(settings, features)
# Run selection
success, panels, report = run_stratification(
features, people, number_wanted, settings, already_selected=already_selected
)
if success:
# Format results
selected_table, remaining_table, _ = selected_remaining_tables(
people, panels[0], features, settings, already_selected=already_selected
)
# Configure output
adapter.selected_tab_name = "Selected Panel"
adapter.remaining_tab_name = "Reserve Pool"
# Export to Google Sheets
dupes, _ = adapter.output_selected_remaining(selected_table, remaining_table, settings)
print(f"Selected {len(panels[0])} people successfully")
if dupes:
print(f"Warning: {len(dupes)} people in remaining pool share addresses")
else:
print("Selection failed:", report.as_text())
Google Sheets Data Format¶
Your spreadsheet should be structured as follows:
Demographics Tab:
| feature | value | min | max |
|---|---|---|---|
| Gender | Male | 45 | 55 |
| Gender | Female | 45 | 55 |
| Age | 18-30 | 20 | 30 |
Note that you can have other columns on the tab - the features import code will ignore them.
Candidates Tab:
| id | Name | Gender | Age | Location | Address | Postcode | |
|---|---|---|---|---|---|---|---|
| p001 | Alice Smith | alice@email.com | Female | 18-30 | Urban | 123 Main St | 12345 |
| p002 | Bob Jones | bob@email.com | Male | 31-50 | Rural | 456 Oak Ave | 67890 |
Writing custom Data Source classes¶
You can create custom data source classes for other data sources like Excel files, SQL databases, or APIs.
AbstractDataSource¶
All data source classes should inherit from AbstractDataSource - and implement the methods it defines:
from sortition_algorithms import RunReport
class AbstractDataSource(abc.ABC):
@abc.abstractmethod
@contextmanager
def read_feature_data(
self, report: RunReport
) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]: ...
@abc.abstractmethod
@contextmanager
def read_people_data(
self, report: RunReport
) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]: ...
@abc.abstractmethod
def write_selected(self, selected: list[list[str]]) -> None: ...
@abc.abstractmethod
def write_remaining(self, remaining: list[list[str]]) -> None: ...
@abc.abstractmethod
def highlight_dupes(self, dupes: list[int]) -> None: ...
Example: Excel Data Source¶
Here's a complete example of an Excel adapter using the openpyxl library:
from pathlib import Path
from typing import Any
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
from sortition_algorithms import AbstractDataSource, FeatureCollection, People, RunReport, SelectionError, Settings
from sortition_algorithms.features import read_in_features
from sortition_algorithms.people import read_in_people
class ExcelDataSource(AbstractDataSource):
"""DataSource for Excel files using openpyxl."""
def __init__(self, excel_file: Path, feature_tab_name: str, people_tab_name: str) -> None:
self.excel_file = excel_file
self.feature_tab_name = feature_tab_name
self.people_tab_name = people_tab_name
self.selected_tab_name = "selected"
self.remaining_tab_name = "remaining"
@contextmanager
def read_feature_data(
self, report: RunReport
) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
"""Load features data from Excel file."""
workbook = openpyxl.load_workbook(self.excel_file)
if self.feature_tab_name not in workbook.sheetnames:
msg = f"Sheet '{self.feature_tab_name}' not found in {excel_file}"
report.add_line(msg)
raise SelectionError(msg, report)
sheet = workbook[self.feature_tab_name]
# Read header row
headers = [cell.value for cell in sheet[1]]
# Read data rows
data = []
for row in sheet.iter_rows(min_row=2, values_only=True):
if any(cell is not None for cell in row): # Skip empty rows
row_dict = {headers[i]: str(row[i]) if row[i] is not None else ""
for i in range(len(headers))}
data.append(row_dict)
yield headers, data
# close the workbook
@contextmanager
def read_people_data(
self, report: RunReport
) -> Generator[tuple[Iterable[str], Iterable[dict[str, str]]], None, None]:
"""Load people from Excel file."""
workbook = openpyxl.load_workbook(excel_file)
if self.people_tab_name not in workbook.sheetnames:
msg = f"Sheet '{self.people_tab_name}' not found in {excel_file}"
report.add_line(msg)
raise SelectionError(msg, report)
sheet = workbook[self.people_tab_name]
# Read header row
headers = [cell.value for cell in sheet[1]]
# Read data rows
data = []
for row in sheet.iter_rows(min_row=2, values_only=True):
if any(cell is not None for cell in row): # Skip empty rows
row_dict = {headers[i]: str(row[i]) if row[i] is not None else ""
for i in range(len(headers))}
data.append(row_dict)
yield headers, data
def write_selected(self, selected: list[list[str]]) -> None:
selected_ws = workbook.create_sheet(self.selected_tab_name)
self._write_data_to_sheet(selected_ws, selected_rows)
workbook.save(self.excel_file)
def write_remaining(self, remaining: list[list[str]]) -> None:
remaining_ws = workbook.create_sheet(self.remaining_tab_name)
self._write_data_to_sheet(remaining_ws, remaining_rows)
workbook.save(self.excel_file)
def highlight_dupes(self, dupes: list[int]) -> None:
# not implemented
pass
def _write_data_to_sheet(self, sheet: Worksheet, data: list[list[str]]) -> None:
"""Write data rows to worksheet."""
for row_idx, row_data in enumerate(data, 1):
for col_idx, cell_value in enumerate(row_data, 1):
sheet.cell(row=row_idx, column=col_idx, value=cell_value)
# Style header row
if data:
for cell in sheet[1]:
cell.font = openpyxl.styles.Font(bold=True)
cell.fill = openpyxl.styles.PatternFill("solid", fgColor="CCCCCC")
# Usage example
def excel_workflow():
data_source = ExcelDataSource(
Path("selection_data.xlsx"),
"Demographics",
"Candidates",
)
select_data = SelectionData(data_source)
settings = Settings()
# Load data
features, _ = select_data.load_features()
people, report = select_data.load_people(settings, features)
# Run selection (assuming you have the selection logic)
success, panels, report = run_stratification(...)
# Export results
select_data.output_selected_remaining(selected_table, remaining_table, settings)
Next Steps¶
- Core Concepts - Understand sortition fundamentals
- API Reference - Complete function documentation
- CLI Usage - Command line interface
- Advanced Usage - Complex scenarios and optimization