Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import csv
import io
import logging
import time
import uuid
from pathlib import Path
from typing import Optional
from typing import Tuple

from django.core.management.base import BaseCommand
from django.db.models import Exists
Expand All @@ -12,17 +17,90 @@

from contentcuration.models import Channel
from contentcuration.models import ContentNode
from contentcuration.models import License


logger = logging.getLogger(__name__)


class LicensingFixesLookup(object):
    """
    Consolidates logic for reading and processing the licensing fixes from the CSV.

    Call `load` once with the open CSV file, then call `get_info` for each node to
    work out which licensing metadata should be applied to it.
    """

    def __init__(self):
        # channel ID (hex form, no dashes) -> CSV row dict; `load` adds a resolved
        # "license_id" to every row that names a license
        self._lookup = {}
        # license ID -> license object, for every license in the database
        self._license_lookup = {}

    def load(self, fp: io.TextIOWrapper):
        """
        Loads the data from the CSV file, and the necessary license data from the database

        :param fp: An open text file containing the CSV, with at least the columns
            `channel_id` and `license_name`
        :raises ValueError: If the CSV names a license that isn't in the database, or a
            `channel_id` value is not a valid UUID
        """
        reader = csv.DictReader(fp)
        license_names = set()

        # create a lookup index by channel ID from the CSV data; the ID is normalized to
        # its dash-less hex form so it matches regardless of how the CSV formats it
        for row in reader:
            self._lookup[uuid.UUID(row["channel_id"]).hex] = row
            if row["license_name"]:
                license_names.add(row["license_name"])

        # load all licenses, regardless of whether they are named in the CSV, because
        # nodes may already carry a license that the CSV never mentions
        license_lookup_by_name = {}
        for lic in License.objects.all():
            self._license_lookup[lic.id] = lic
            license_lookup_by_name[lic.license_name] = lic
            license_names.discard(lic.license_name)

        # anything left over was named in the CSV but doesn't exist in the database
        if license_names:
            raise ValueError(f"Could not find all licenses: {license_names}")

        # we now are certain all named licenses are found
        for info in self._lookup.values():
            if info["license_name"]:
                info["license_id"] = license_lookup_by_name[info["license_name"]].id

    def get_info(
        self,
        channel_id: str,
        license_id: Optional[int],
        license_description: Optional[str],
        copyright_holder: Optional[str],
    ) -> Tuple[Optional[int], Optional[str], Optional[str]]:
        """
        Determines the complete licensing metadata, given the current metadata, and comparing it
        with what would make the node complete.

        Existing values on the node always win; the CSV data only fills in what is missing.

        :param channel_id: The channel the node was sourced from
        :param license_id: The current license_id of the node
        :param license_description: The current license_description of the node
        :param copyright_holder: The current copyright_holder of the node
        :return: A tuple of (license_id, license_description, copyright_holder) to use on the node
        """
        info = self._lookup.get(channel_id)
        if info is None:
            logger.warning(f"Failed to find licensing info for channel: {channel_id}")
            return license_id, license_description, copyright_holder

        # prefer the node's own license; rows without a license name may have a blank or
        # absent `license_id`, which is treated the same as having no license at all
        resolved_id = license_id or info.get("license_id") or None
        if not resolved_id:
            return None, license_description, copyright_holder

        license_obj = self._license_lookup.get(resolved_id)
        if license_obj is None:
            # an unrecognized license can't tell us what else is required, so leave the
            # node's metadata untouched rather than crashing the whole run
            logger.warning(
                f"Failed to find license {resolved_id} for channel: {channel_id}"
            )
            return license_id, license_description, copyright_holder

        if license_obj.is_custom and not license_description:
            license_description = info["license_description"]

        if license_obj.copyright_holder_required and not copyright_holder:
            copyright_holder = info["copyright_holder"]

        return resolved_id, license_description, copyright_holder


class Command(BaseCommand):
"""
Audits nodes that have imported content from public channels and whether the imported content
has a missing source node. We've determined that pretty much all of these have incomplete
licensing data, so the command now fills in the missing licensing metadata on those nodes.
"""

def handle(self, *args, **options):
Expand Down Expand Up @@ -71,32 +149,27 @@ def handle(self, *args, **options):

logger.info("=== Iterating over private destination channels. ===")
channel_count = 0
total_node_count = 0

with open("fix_missing_import_sources.csv", "w", newline="") as csv_file:
csv_writer = csv.DictWriter(
csv_file,
fieldnames=[
"channel_id",
"channel_name",
"contentnode_id",
"contentnode_title",
"public_channel_id",
"public_channel_name",
"public_channel_deleted",
],
)
csv_writer.writeheader()
total_fixed = 0
lookup = LicensingFixesLookup()

command_dir = Path(__file__).parent
csv_path = command_dir / "licensing_fixes_lookup.csv"

with csv_path.open("r", encoding="utf-8", newline="") as csv_file:
lookup.load(csv_file)

for channel in destination_channels.iterator():
node_count = self.handle_channel(csv_writer, channel)
# skip using an iterator here, to limit transaction duration to `handle_channel`
for channel in destination_channels:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

praise: Good call not using .iterator() on the outer channel loop — keeps the long-lived transaction scoped to handle_channel instead of spanning the entire command.

node_count = self.handle_channel(lookup, channel)

if node_count > 0:
total_node_count += node_count
channel_count += 1
if node_count > 0:
total_fixed += node_count
channel_count += 1

logger.info("=== Done iterating over private destination channels. ===")
logger.info(f"Found {total_node_count} nodes across {channel_count} channels.")
logger.info(
f"Fixed incomplete licensing data on {total_fixed} nodes across {channel_count} channels."
)
logger.info(f"Finished in {time.time() - start}")

def get_public_cte(self) -> With:
Expand All @@ -110,7 +183,15 @@ def get_public_cte(self) -> With:
name="public_cte",
)

def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
def handle_channel(self, lookup: LicensingFixesLookup, channel: dict) -> int:
"""
Goes through the nodes of the channel, that were imported from public channels, but no
longer have a valid source node. For each node, it applies license metadata as necessary

:param lookup: The lookup utility to pull licensing data from
:param channel: The channel to fix
:return: The total node count that are now marked complete as a result of the fixes
"""
public_cte = self.get_public_cte()
channel_id = channel["id"]
channel_name = channel["name"]
Expand All @@ -136,29 +217,50 @@ def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
)
)
)
.values(
"public_channel_id",
"public_channel_name",
"public_channel_deleted",
contentnode_id=F("id"),
contentnode_title=F("title"),
)
)

# Count and log results
node_count = missing_source_nodes.count()
processed = 0
was_complete = 0
unfixed = 0
now_complete = 0

# TODO: this will be replaced with logic to correct the missing source nodes
if node_count > 0:
def _log():
logger.info(
f"{channel_id}:{channel_name}\t{node_count} node(s) with missing source nodes."
f"Fixing {channel_id}:{channel_name}\ttotal: {node_count}; before: {was_complete} unfixed: {unfixed}; after: {now_complete};"
)
row_dict = {
"channel_id": channel_id,
"channel_name": channel_name,
}
for node_dict in missing_source_nodes.iterator():
row_dict.update(node_dict)
csv_writer.writerow(row_dict)

return node_count

if node_count > 0:
for node in missing_source_nodes.iterator():
# determine the new license metadata
license_id, license_description, copyright_holder = lookup.get_info(
node.original_channel_id,
node.license_id,
node.license_description,
node.copyright_holder,
)

# if there isn't a license, there's nothing to do
if not license_id:
unfixed += 1
# cannot fix
continue

if node.complete:
was_complete += 1

# apply updates
node.license_id = license_id
node.license_description = license_description
node.copyright_holder = copyright_holder
if not node.mark_complete():
now_complete += 1
node.save()
processed += 1
if processed % 100 == 0:
_log()

_log()

return now_complete - was_complete
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
channel_id,channel_name,license_id,license_name,license_description,copyright_holder
f9d3e0e4-6ea2-5789-bbed-672ff6a399ed,African Storybook Library (multiple languages),,CC BY,"",African Storybook Initiative
d0ef6f71-e4fe-4e54-bb87-d7dab5eeaae2,Be Strong: Internet safety resources,,CC BY-NC-ND,"",Vodafone
2d7b056d-668a-58ee-9244-ccf76108cbdb,Book Dash,,CC BY,"",http://bookdash.org/
922e9c57-6c2f-59e5-9389-142b136308ff,Career Girls,,Special Permissions,For use on Kolibri,Career Girls
da53f90b-1be2-5752-a046-82bbc353659f,Ciencia NASA,,,,""
0294a064-f722-4899-887c-e07bd47f9991,Citoyennes de la Terre,,CC BY,"",Florence Piron
604ad3b8-5d84-4dd8-9ee7-0fa12a9a5a6e,CREE+,,CC BY-NC-SA,"","Publicado por el Lic. Edelberto Andino(edelberto.andino.ea@gmail.com) para ser utilizado con fines educativos únicamente, no debe ser utilizado con fines lucrativos de ninguna índole."
ef2ead65-de76-4ea4-a27b-ba6df5282c74,CSpathshala - सीएसपाठशाला (हिंदी),,CC BY,"",ए सि एम् इंडिया
7e68bc59-d430-4e71-8a07-50b1b87125ad,Cultura Emprendedora,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Junta de Andalucia
c51a0f84-2fed-427c-95ac-ff9bb4a21e3c,EENET Inclusive Education Training Materials,,CC BY-NC-SA,"",Enabling Education Network (EENET)
0e173fca-6e90-52f8-a474-a2fb84055faf,Global Digital Library - Book Catalog,,CC BY,"",Enabling Writers Initiative
624e09bb-5eeb-4d20-aa8d-e62e7b4778a0,How to get started with Kolibri,,CC BY-NC,"",Learning Equality
378cf412-8c85-4c27-95c1-00b5aca7a3ed,Inclusive Home Learning Activities,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",EENET – Enabling Education Network
d76da4d3-6cfd-5927-9b57-5dfc6017aa13,Kamkalima (العربيّة),,CC BY-NC-ND,"",Kamkalima
2fd54ca4-7a8f-59c9-9fce-faaa3894c19e,Khan Academy (English - CBSE India Curriculum),,CC BY-NC-SA,"",Khan Academy
c9d7f950-ab6b-5a11-99e3-d6c10d7f0103,Khan Academy (English - US curriculum),,CC BY-NC-SA,"",Khan Academy
c1f2b7e6-ac9f-56a2-bb44-fa7a48b66dce,Khan Academy (Español),,CC BY-NC-SA,"",Khan Academy
878ec2e6-f88c-5c26-8b1b-e6f202833cd4,Khan Academy (Français),,CC BY-NC-SA,"",Khan Academy
801a5f02-9420-5569-8918-edcff6494185,Khan Academy (Italiano),,CC BY-NC-SA,"",Khan Academy
ec164fee-25ee-5262-96e6-8f7c10b1e169,Khan Academy (Kiswahili),,CC BY-NC-SA,"",Khan Academy
2ac071c4-6723-54f2-aa78-953448f81e50,Khan Academy (Português - Brasil),,CC BY-NC-SA,"",Khan Academy
c3231d84-4f8d-5bb1-b4cb-c6a7ddd91eb7,Khan Academy (Português (Portugal)),,CC BY-NC-SA,"",Khan Academy
09ee940e-1069-53a2-b671-6e1020a0ce3f,Khan Academy (български език),,CC BY-NC-SA,"",Khan Academy
a53592c9-72a8-594e-9b69-5aa127493ff6,Khan Academy (हिन्दी),,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
a03496a6-de09-5e7b-a9d2-4291a487c78d,Khan Academy (বাংলা),,CC BY-NC-SA,"",Khan Academy
5357e525-81c3-567d-a4f5-6d56badfeac7,Khan Academy (ગુજરાતી),,CC BY-NC-SA,"",Khan Academy
2b608c6f-d4c3-5c34-b738-7e3dd7b53265,Khan Academy (ဗမာစာ),,CC BY-NC-SA,"",Khan Academy
f5b71417-b1f6-57fc-a4d1-aaecd23e4067,Khan Academy (ភាសាខ្មែរ),,Special Permissions,Permission granted to distribute through Kolibri for non-commercial use,Khan Academy
ec599e77-f9ad-5802-8975-e8a26e6f1821,Khan Academy (中文(中国)),,CC BY-NC-SA,"",Khan Academy
913efe9f-14c6-5cb1-b234-02f21f056e99,MIT Blossoms,,CC BY-NC-SA,"",MIT Blossoms
fc47aee8-2e01-53e2-a301-97d3fdee1128,Open Stax,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",Rice University
b8bd7770-063d-40a8-bd9b-30d4703927b5,PBS SoCal: Family Math,,All Rights Reserved,"",PBS SoCal
197934f1-4430-5350-b582-0c7c4dd8e194,PhET Interactive Simulations,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
aa254505-59b5-5bd7-9bc9-0c09dfb805d2,PhET simulações interativas,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
889f0c34-b275-507a-b8d3-7d2da2d03aa9,PhET – інтерактивне моделювання,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
f6cb302e-f659-4db4-b4a0-4b4991a595c2,Plan Educativo TIC Basico,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Junta de Andalucia
e832106c-6398-54e1-8161-6015a8b87910,PraDigi,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",PraDigi
131e543d-becf-5776-bb13-cfcfddf05605,Pratham Books' StoryWeaver,,CC BY,"The Attribution License lets others distribute, remix, tweak, and build upon your work, even commercially, as long as they credit you for the original creation. This is the most accommodating of licenses offered. Recommended for maximum dissemination and use of licensed materials.",Pratham Books
f758ac6a-d39c-452f-9566-58da6ad7d3cc,Project Based Learning with Kolibri,,,"",""
305b12ea-5ea8-4fa1-8f93-3705c23f5ee0,School of Thought,,CC BY,"",School of Thought
3e464ee1-2f6a-50a7-81cd-df59147b48b1,Sikana (English),,CC BY-NC-ND,"",Sikana Education
30c71c99-c42c-57d1-81e8-aeafd2e15e5f,Sikana (Español),,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Sikana Education
8ef625db-6e86-506c-9a3b-ac891e413fff,Sikana (Français),,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Sikana Education
f4715a77-6972-5c72-9d25-d29977b8b308,Similasyon Enteraktif PhET,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
8fa678af-1dd0-5329-bf32-18c549b84996,Simulaciones interactivas PhET,,CC BY,"","PhET Interactive Simulations, University of Colorado Boulder"
a9b25ac9-8147-42c8-83ce-1b0579448337,TESSA - Teacher Resources,,CC BY-NC-SA,"",Open University
74f36493-bb47-5b62-935f-a8705ed59fed,Thoughtful Learning,,CC BY-NC-SA,"The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, and build upon your work non-commercially, as long as they credit you and license their new creations under the identical terms.",Thoughtful Learning
000409f8-1dbe-5d1b-a671-01cb9fed4530,Touchable Earth (en),,Special Permissions,Permission has been granted by Touchable Earth to distribute this content through Kolibri.,Touchable Earth Foundation (New Zealand)
b336c2e2-c45c-53d5-b24e-5c476a54b077,Touchable Earth (fr),,Special Permissions,Permission has been granted by Touchable Earth to distribute this content through Kolibri.,Touchable Earth Foundation (New Zealand)
08a53136-a155-5f64-b049-6b3e1318b0cd,Ubongo Kids,,CC BY-NC-ND,"The Attribution-NonCommercial-NoDerivs License is the most restrictive of our six main licenses, only allowing others to download your works and share them with others as long as they credit you, but they can't change them in any way or use them commercially.",Ubongo Media
237e5975-bce2-5bf6-aff3-98f4c17516f3,,,,,
Loading
Loading