Created
October 22, 2019 09:56
-
-
Save camallen/0279a30a414902eeaa0ef6b5321c3ae4 to your computer and use it in GitHub Desktop.
Broken survey task labels on AmazonCam Tambopata project (invalid translation strings data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. The underlying cause of the issue
# ~17:00-17:30 on Friday 18th October, 2019
# These are the commands I used while working on the translation fix issue:
# fixing the workflow strings for an older survey task (missing descriptions).
# https://www.zooniverse.org/lab/3040/workflows/2485
#
# WARNING: this section is a record of the commands that CAUSED the incident.
# It is kept for reference only; do not re-run it against a live project.
workflow = Workflow.find(2485)
tasks = workflow.tasks
task_string_extractor = TasksVisitors::ExtractStrings.new
# I believe I reused the workflow tasks variable here and mangled the workflow translation strings,
# as the workflow.tasks value had already been run through `TasksVisitors::ExtractStrings`.
# Instead I should have reloaded the workflow object to ensure the tasks were fresh and untouched.
# Note: always take care when working on a rails console;
# best practice is to use a test project to vet your changes before running on production / live projects.
# `visit` modifies the tasks in place instead of returning a new task object.
task_string_extractor.visit(tasks)
stripped_tasks = tasks
strings = task_string_extractor.collector
workflow.strings = strings
workflow.save
# I believe reusing the workflow variable here is where I mangled the translation strings
# and thus added the broken labels to the English language translation record;
# instead I should have reloaded the workflow object to ensure the tasks were fresh and untouched.
#
# Now update the translation strings for the workflow's primary language.
translated_strings = TranslationStrings.new(workflow).extract
translation_lookup = { translated: workflow, language: workflow.primary_language }
translation = Translation.find_or_initialize_by(translation_lookup)
translation.update_strings_and_versions(translated_strings, workflow.latest_version_id)
translation.save!
# Test what the injected strings look like after loading a fresh representation of the workflow.
t_workflow = Workflow.find(2485)
string_injector = TasksVisitors::InjectStrings.new(t_workflow.strings)
visit_result = string_injector.visit(t_workflow.tasks)
# Inspect the tasks after the injector has visited them.
t_workflow.tasks
# 2. The fixes for the issue
# ~15:30-19:00 on Sunday 20th October, 2019
# The commands I used to fix the broken workflow strings from a restored backup
# and to fix the translation issues.
workflow = Workflow.find_by(id: 2485)
good_workflow_data = bad_workflow_data = nil
# Get the bad strings (host == panoptes main db):
# CSV.open("tmp/workflow_2485_bad_strings.csv", "wb") do |csv|
#   csv << ["id", "tasks", "strings"]
#   csv << [workflow.id, workflow.tasks.to_json, workflow.strings.to_json]
# end
# I exported a file to share the state of the bad strings for testing.
bad_workflow_data = CSV.read("tmp/workflow_2485_bad_strings.csv")
# Bork them again first: row 0 is the header row, row 1 holds [id, tasks JSON, strings JSON].
borked_tasks, borked_strings = bad_workflow_data[1].values_at(1, 2).map { |json| JSON.parse(json) }
# ---------------------------------------------------------------------------------------------
# Now get the good strings (change the host to the restored backup):
# CSV.open("tmp/workflow_2485_good_strings.csv", "wb") do |csv|
#   csv << ["id", "tasks", "strings"]
#   csv << [workflow.id, workflow.tasks.to_json, workflow.strings.to_json]
# end
# Get the good strings from the saved file.
good_workflow_data = CSV.read("tmp/workflow_2485_good_strings.csv")
# Row 0 is the header row, row 1 holds [id, tasks JSON, strings JSON].
good_tasks, good_strings = good_workflow_data[1].values_at(1, 2).map { |json| JSON.parse(json) }
# Inspect the (top-level) difference between two hashes.
#
# The result contains:
#   * every entry of `first` whose key is missing from `second` or whose value differs
#     (the value reported is the one from `first`), followed by
#   * every entry of `second` whose key is not present in `first`.
#
# Neither argument is modified. Nested hashes are compared with `==`, not diffed recursively.
#
# @param first [Hash] the reference hash
# @param second [Hash] the hash to compare against
# @return [Hash] a new hash holding the differing entries
def hash_diff(first, second)
  # Check key presence explicitly: a nil value in `first` must not be treated as
  # equal to a key that is absent from `second` (where `second[key]` is also nil).
  changed_or_removed = first.reject { |key, value| second.key?(key) && second[key] == value }
  added = second.reject { |key, _value| first.key?(key) }
  changed_or_removed.merge(added)
end
# Have a look at the good and borked data to see what they look like,
# using the 'TAYRA' choice as a sample.
good_tasks["T0"]['choices']['TAYRA']
borked_tasks["T0"]['choices']['TAYRA']
mentions_tayra = ->(key) { key.include?('TAYRA') }
# Just the matching string keys ...
borked_strings.keys.select(&mentions_tayra)
# ... and the matching [key, value] pairs for both data sets.
borked_strings.to_a.select { |key, _value| mentions_tayra.call(key) }
good_strings.to_a.select { |key, _value| mentions_tayra.call(key) }
# Diff every hash-valued entry of the T0 task; non-hash entries yield nil.
good_tasks["T0"].map do |key, good_value|
  hash_diff(good_value, borked_tasks["T0"][key]) if good_value.is_a?(Hash)
end
# Attempt to restore the good string data to the broken workflow configs.
# First inject the good strings back into the good tasks, so that we can then re-extract them
# and get the 'proper' (not only label) representations of the extracted strings,
# e.g. the old data was extracted without descriptions etc.
#
# Use deep copies: the visitors modify the task data in place, and a shallow `dup`
# would still share (and therefore mutate) the nested hashes of the restored good data.
tasks_to_inject = good_tasks.deep_dup
strings_to_inject = good_strings.deep_dup
TasksVisitors::InjectStrings.new(strings_to_inject).visit(tasks_to_inject)
# Get the strings / tasks from the injected data.
task_string_extractor = TasksVisitors::ExtractStrings.new
task_string_extractor.visit(tasks_to_inject)
extracted_tasks = tasks_to_inject
extracted_strings = task_string_extractor.collector
# Now update the workflow column data (avoid callbacks and version histories, we are replacing them manually).
workflow.update_columns(strings: extracted_strings, tasks: extracted_tasks)
# Find and fix the borked workflow version tasks & strings.
workflow.workflow_version_ids.sort
# "6394416,6394417,6394418,6394419,6394420,6394421,6394422,6394423,6394424,6394425,6394426,6394427,6394428,6394429,6394430,6394431,6394432,6394433,6394434,6394435,6394436,6394437,6394438,6394439,6394440,6394441,6498095"
# Manually check the last two (highest ids) - 6394441,6498095.
# Only the last one is broken after looking at it (2nd last made in Jan when migrating to new version system).
wv = WorkflowVersion.find(6498095)
wv.update_columns(strings: extracted_strings, tasks: extracted_tasks)
# OK - now fix the synced primary translation, starting from a freshly loaded workflow.
workflow = Workflow.find_by(id: 2485)
language = workflow.primary_language
translation = Translation.find_or_initialize_by(translated: workflow, language: language)
translated_strings = TranslationStrings.new(workflow).extract
translation.update_strings_and_versions(translated_strings, workflow.latest_version_id)
# Check the translation record strings are correct now.
translation.strings
# Check the translation string_versions look good as well
# (they should refer to the correct workflow_version record we found above).
translation.string_versions
# Do not save, it'll create a new translation version; instead just update this one in place.
corrected_strings = translation.strings
translation.update_column(:strings, corrected_strings)
# Now update the translation version we previously borked.
translation = Translation.find_or_initialize_by(translated: workflow, language: language)
translation.translation_version_ids.sort
tv = TranslationVersion.find(118625)
tv.strings = translation.strings
tv.string_versions = translation.string_versions
# Check the pending changes (only strings).
tv.changes.keys
# Update the strings so this version is correct.
version_strings = tv.strings
tv.update_column(:strings, version_strings)
# Update the busted classifications: tag every classification made against the broken
# workflow version since the incident with an 'invalid_translation' metadata note.
#
# The cut-off is computed relative to the current time, so it only matches the incident
# window when run at the time of the fix (Sunday 20th October, 2019).
incident_date = DateTime.now.utc - 3.days - 1.hour - 30.minutes
# NOTE(review): `project` is not assigned anywhere in this script - presumably it is the
# workflow's project; confirm it is defined in the console before running this.
borked_cs = Classification.
  where(project_id: project.id).
  where("created_at > ?", incident_date).
  where(workflow_version: '587.31')
# Iterate in batches purely for the side effect; update_column skips callbacks and validations.
borked_cs.find_each do |cs|
  cs.update_column(:metadata, cs.metadata.merge('invalid_translation' => 'original lables not showing correctly'))
end
# emailed the researchers Sunday evening, 20th October ~18:45 | |
# emailed the project translator Sunday evening, 20th October ~18:55 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment