Skip to content
235 changes: 235 additions & 0 deletions pokemon_v2/test_models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import csv
import os
import re
from django.test import TestCase
from pokemon_v2.models import *

Expand All @@ -9,3 +12,235 @@ def setUp(self):
def fields_are_valid(self):
smell = Ability.objects.get(name="Smell")
self.assertEqual(smell.generation_id, 3)


class CSVResourceNameValidationTestCase(TestCase):
"""
Test that all resource identifiers in CSV files follow ASCII slug format.

Resource identifiers are used in API URLs and should be URL-safe ASCII slugs
(lowercase letters, numbers, and hyphens only).

This test validates the data source (CSV files) before it's loaded into the database.
"""

# Pattern for valid resource identifiers: lowercase letters, numbers, and hyphens only
VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z0-9-]+$")

# CSV files that contain an 'identifier' column to validate
# Format: (filename, identifier_column_name)
CSV_FILES_TO_VALIDATE = [
("abilities.csv", "identifier"),
("berry_firmness.csv", "identifier"),
("conquest_episodes.csv", "identifier"),
("conquest_kingdoms.csv", "identifier"),
("conquest_move_displacements.csv", "identifier"),
("conquest_move_ranges.csv", "identifier"),
("conquest_stats.csv", "identifier"),
("conquest_warrior_archetypes.csv", "identifier"),
("conquest_warrior_skills.csv", "identifier"),
("conquest_warrior_stats.csv", "identifier"),
("conquest_warriors.csv", "identifier"),
("contest_types.csv", "identifier"),
("egg_groups.csv", "identifier"),
("encounter_conditions.csv", "identifier"),
("encounter_condition_values.csv", "identifier"),
("encounter_methods.csv", "identifier"),
("evolution_triggers.csv", "identifier"),
("genders.csv", "identifier"),
("generations.csv", "identifier"),
("growth_rates.csv", "identifier"),
("items.csv", "identifier"),
("item_categories.csv", "identifier"),
("item_flags.csv", "identifier"),
("item_fling_effects.csv", "identifier"),
("item_pockets.csv", "identifier"),
("languages.csv", "identifier"),
("locations.csv", "identifier"),
("location_areas.csv", "identifier"),
("moves.csv", "identifier"),
("move_battle_styles.csv", "identifier"),
("move_damage_classes.csv", "identifier"),
("move_flags.csv", "identifier"),
("move_meta_ailments.csv", "identifier"),
("move_meta_categories.csv", "identifier"),
("move_targets.csv", "identifier"),
("natures.csv", "identifier"),
("pal_park_areas.csv", "identifier"),
("pokeathlon_stats.csv", "identifier"),
("pokedexes.csv", "identifier"),
("pokemon.csv", "identifier"),
("pokemon_colors.csv", "identifier"),
("pokemon_forms.csv", "identifier"),
("pokemon_habitats.csv", "identifier"),
("pokemon_move_methods.csv", "identifier"),
("pokemon_shapes.csv", "identifier"),
("pokemon_species.csv", "identifier"),
("regions.csv", "identifier"),
("stats.csv", "identifier"),
("types.csv", "identifier"),
("versions.csv", "identifier"),
("version_groups.csv", "identifier"),
Copy link
Copy Markdown
Member

@jemarq04 jemarq04 Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These all have the same name for the column: identifier. Maybe just have this be a list of strings (the base file names) and update things to match?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, maybe it's better. When I started create the test, I didn't know if all the name had the same column name. That's why I did it this way. I will make the modification.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change is done ;)

]

def get_csv_path(self, filename):
"""Get the absolute path to a CSV file in data/v2/csv/"""
from django.conf import settings
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why import in this function every time it's called instead of at the top of the file?

Also, considering the function can be put in one line and still be easily read I think maybe we can just retrieve the path directly in the test method.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback! You're absolutely right about both points. For the import, I apologize - I should have placed it at the top of the file. I will make the changes as you suggested ;).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change is done ;)


base_dir = settings.BASE_DIR
return os.path.join(base_dir, "data", "v2", "csv", filename)

def test_all_csv_identifiers_are_ascii_slugs(self):
"""
Validate that all resource identifiers in CSV files follow the ASCII slug format.

Identifiers should only contain:
- Lowercase letters (a-z)
- Numbers (0-9)
- Hyphens (-)

This test will fail if any CSV contains identifiers with:
- Unicode characters (ñ, ', é, etc.)
- Uppercase letters
- Spaces
- Special characters (&, (), ', etc.)
"""
violations = []
missing_files = []

for filename, identifier_column in self.CSV_FILES_TO_VALIDATE:
csv_path = self.get_csv_path(filename)

# Track missing files to report at the end
if not os.path.exists(csv_path):
missing_files.append(filename)
continue

try:
with open(csv_path, "r", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)

# Check if the identifier column exists
if identifier_column not in reader.fieldnames:
violations.append(
{
"file": filename,
"row": "N/A",
"id": "N/A",
"identifier": f"Column '{identifier_column}' not found",
}
)
continue

for row_num, row in enumerate(
reader, start=2
): # Start at 2 (after header)
identifier = row.get(identifier_column, "").strip()

# Skip empty identifiers
if not identifier:
continue

# Check if identifier matches the pattern
if not self.VALID_IDENTIFIER_PATTERN.match(identifier):
violations.append(
{
"file": filename,
"row": row_num,
"id": row.get("id", "N/A"),
"identifier": identifier,
}
)

except Exception as e:
violations.append(
{
"file": filename,
"row": "N/A",
"id": "N/A",
"identifier": f"Error reading file: {str(e)}",
}
)

# If there are violations or missing files, create a detailed error message
if violations or missing_files:
error_lines = []

# Report missing files first
if missing_files:
error_lines.append("\n\nMissing CSV files:")
for filename in missing_files:
error_lines.append(f" - {filename}")
error_lines.append(
"\nAll CSV files listed in CSV_FILES_TO_VALIDATE must exist."
)

# Report violations
if violations:
error_lines.append(
"\n\nFound {} resource(s) with invalid identifiers (not ASCII slugs):".format(
len(violations)
)
)
error_lines.append("\nIdentifiers must match pattern: ^[a-z0-9-]+$")
error_lines.append("\nInvalid identifiers found in CSV files:")

for v in violations:
error_lines.append(
" - {file} (row {row}, id={id}): {identifier}".format(**v)
)

error_lines.append(
"\nThese identifiers contain invalid characters and must be normalized."
)
error_lines.append(
"Update the CSV files in data/v2/csv/ to fix these identifiers."
)
error_lines.append("\nSuggested fixes:")
error_lines.append(
" - Remove Unicode apostrophes (') and replace with regular hyphens or remove"
)
error_lines.append(" - Remove Unicode letters (ñ → n)")
error_lines.append(
" - Remove parentheses and other special characters"
)
error_lines.append(" - Convert to lowercase")

self.fail("\n".join(error_lines))

def test_identifier_pattern_examples(self):
"""Test that the validation pattern works correctly with example identifiers."""
# Valid identifiers
valid_identifiers = [
"pikachu",
"charizard-mega-x",
"mr-mime",
"ho-oh",
"type-null",
"item-123",
"mega-stone",
]

for identifier in valid_identifiers:
self.assertTrue(
self.VALID_IDENTIFIER_PATTERN.match(identifier),
f"{identifier} should be valid but was rejected",
)

# Invalid identifiers
invalid_identifiers = [
"Pikachu", # Uppercase
"Mr. Mime", # Space and period
"kofu's-wallet", # Unicode apostrophe
"jalapeño", # Unicode ñ
"steel-bottle-(r)", # Parentheses
"b&w-grass-tablecloth", # Ampersand
"farfetch'd", # Apostrophe
"kofu's-wallet", # Regular apostrophe
]

for identifier in invalid_identifiers:
self.assertFalse(
self.VALID_IDENTIFIER_PATTERN.match(identifier),
f"{identifier} should be invalid but was accepted",
)
Loading