diff --git a/nel-emerson/scripts/el_recipe.py b/nel-emerson/scripts/el_recipe.py index 7d7b5f2a0..2a2aaa4c6 100644 --- a/nel-emerson/scripts/el_recipe.py +++ b/nel-emerson/scripts/el_recipe.py @@ -45,7 +45,7 @@ def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc): # For each NER mention, add the candidates from the KB to the annotation task stream = _add_options(stream, kb, id_dict) - stream = filter_duplicates(stream, by_input=True, by_task=False) + stream = filter_duplicates(stream, by_input=False, by_task=True) return { "dataset": dataset, diff --git a/nel-emerson/scripts/notebook_video.ipynb b/nel-emerson/scripts/notebook_video.ipynb index 3ce1a0a1f..90f44915e 100644 --- a/nel-emerson/scripts/notebook_video.ipynb +++ b/nel-emerson/scripts/notebook_video.ipynb @@ -314,7 +314,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you are watching [the video](https://www.youtube.com/watch?v=8u57WSXVpmw), it will explain how to obtain annotated data with Prodigy. The final result will be a JSONL file that is distributed alongside this notebook. We'll now use this JSONL file to train our entity linker. If you want to skip the annotation part in the video, you can fast forward to [this secion](https://www.youtube.com/watch?v=8u57WSXVpmw&t=19m19s)." + "If you are watching [the video](https://www.youtube.com/watch?v=8u57WSXVpmw), it will explain how to obtain annotated data with Prodigy. The final result will be a JSONL file that is distributed alongside this notebook. We'll now use this JSONL file to train our entity linker. If you want to skip the annotation part in the video, you can fast forward to [this section](https://www.youtube.com/watch?v=8u57WSXVpmw&t=19m19s)." ] }, { diff --git a/nel-wikipedia/README.md b/nel-wikipedia/README.md index 4e628a19f..a1e089267 100644 --- a/nel-wikipedia/README.md +++ b/nel-wikipedia/README.md @@ -3,6 +3,8 @@ These scripts are provided on an as-is basis. 
Questions around usage of these scripts are best asked at StackOverflow. Note that processing the whole of Wikipedia will require a lot of resources and can be overkill for your specific use-case. +Note: this project was created for spaCy v2.x. If you want to run this functionality with spaCy v3.x, check out [wikid](https://github.com/explosion/wikid) and the [NEL benchmark project](https://github.com/explosion/projects/tree/v3/benchmarks/nel). + ### Step 1: Create a Knowledge Base (KB) and training data Run `wikidata_pretrain_kb.py`