Stuck on an issue?

Lightrun Answers was designed to reduce the constant googling that comes with debugging 3rd party libraries. It collects links to all the places you might be looking at while hunting down a tough bug.

And, if you’re still stuck at the end, we’re happy to hop on a call to see how we can help out.

train_new_entity_type.py is inter-mixing doc and entity_offset data with BILOU format training data

See original GitHub issue

Bug in the train new entity type

I am trying to add a new entity type named technology, and I have annotated data in BILOU format that I want to train on using the train_new_entity_type.py example.

The original example is trained with data in entity offset format, but it can also be trained with data in BILOU format, as explained.

My training data in BILOU format is given below: ` train_data = [

    ('Play a key role in developing high - performance , scalable and fault - tolerant applications written in Java ',
    ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'U-technology']),

    ('Focus around building custom APIs for a variety of connected devises ',
     ['O', 'O', 'O', 'O', 'U-technology', 'O', 'O', 'O', 'O', 'O', 'O'])
]`

Complete Training Script with BILOU format

from __future__ import unicode_literals, print_function
import spacy
from spacy.gold import GoldParse
import random
from pathlib import Path


def train_ner(nlp, train_data, output_dir):
    """Train the entity recognizer of ``nlp`` on ``train_data`` (failing reproduction).

    This is the script from the bug report, kept exactly as reported. Inside
    the training loop ``GoldParse`` is constructed *before* ``doc`` is rebuilt
    for the current ``raw_text``, so it receives the ``Doc`` left over from the
    vocab loop (first pass) or from the previous example (later passes).

    Args:
        nlp: Pipeline object; this function uses its ``make_doc``, ``vocab``,
            ``tagger``, ``entity``, ``end_training`` and ``save_to_directory``
            members.
        train_data: List of ``(raw_text, annotations)`` pairs. It is shuffled
            in place on every iteration.
        output_dir: Destination passed to ``nlp.save_to_directory``; saving is
            skipped when this is falsy.
    """
    # Add new words to vocab
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            # The lookup result is discarded; per the comment above, it is
            # done only so that the word ends up in the vocab.
            _ = nlp.vocab[word.orth]
        print(doc,len(doc))

    # NOTE: after the loop above, ``doc`` still holds the Doc of the LAST text
    # in train_data. The training loop below reads it before reassigning it.
    for itn in range(20):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            # Debug output added by the reporter: ``doc`` printed here is the
            # stale Doc, not the one for the current ``raw_text``.
            print(doc, "token length", len(doc), "length of entity", len(entity_offsets),id(doc))
            print(entity_offsets)
            # BUG (subject of this report): ``doc = nlp.make_doc(raw_text)``
            # belongs here, before the GoldParse object is built.
            gold = GoldParse(doc, entities=entity_offsets) # doc is used before it is reassigned for this raw_text
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            loss = nlp.entity.update(doc, gold)
    nlp.end_training()
    if output_dir:
        nlp.save_to_directory(output_dir)



def main(model_name, output_directory=None):
    """Load a model, register the 'technology' label and train on BILOU data.

    Args:
        model_name: Name handed to ``spacy.load``.
        output_directory: Optional location for the trained model. When given
            it is wrapped in ``pathlib.Path`` before being passed on; when
            ``None`` it is passed on unchanged.
    """
    pipeline = spacy.load(model_name)
    target_dir = None if output_directory is None else Path(output_directory)

    # Each example pairs a text with one BILOU tag per token.
    examples = [
        (
            'Play a key role in developing high - performance , scalable and fault - tolerant applications written in Java ',
            ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'U-technology'],
        ),
        (
            'Focus around building custom APIs for a variety of connected devises ',
            ['O', 'O', 'O', 'O', 'U-technology', 'O', 'O', 'O', 'O', 'O', 'O'],
        ),
    ]

    pipeline.entity.add_label('technology')
    train_ner(pipeline, examples, target_dir)

if __name__ == '__main__':
    # Script entry point: load the 'en' model and save the trained result
    # under "example_model".
    output_directory = "example_model"
    main('en', output_directory=output_directory)

The above example throws the error shown below:

gold = GoldParse(doc, entities=entity_offsets) #loc is refefrenced before assignment
File "spacy/gold.pyx", line 294, in spacy.gold.GoldParse.__init__ (spacy/gold.cpp:10834)
IndexError: list index out of range

The above issue can be fixed by moving ‘doc = nlp.make_doc(raw_text)’ so that it comes before the GoldParse object is created. Currently it is assigned after the GoldParse object is constructed, so GoldParse is built from the doc of a different text, and this also triggers the warning “Local variable might be referenced before assignment”.

Complete Training Script with entity offset format

The example below works fine when the training data is in entity offset format.

from __future__ import unicode_literals, print_function
import spacy
from spacy.gold import GoldParse
import random
from pathlib import Path


def train_ner(nlp, train_data, output_dir):
    """Train the entity recognizer of ``nlp`` on ``train_data``.

    The ``Doc`` for each example is created before its ``GoldParse`` so the
    gold annotations always refer to the text they were written for. Building
    ``GoldParse`` first would hand it the ``Doc`` left over from the vocab
    loop or from the previous example.

    Args:
        nlp: Pipeline object; this function uses its ``make_doc``, ``vocab``,
            ``tagger``, ``entity``, ``end_training`` and ``save_to_directory``
            members.
        train_data: List of ``(raw_text, annotations)`` pairs. It is shuffled
            in place on every iteration.
        output_dir: Destination passed to ``nlp.save_to_directory``; saving is
            skipped when this is falsy.
    """
    # Add new words to vocab
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            # The lookup result is discarded; it is done only so that the
            # word ends up in the vocab.
            _ = nlp.vocab[word.orth]
        print(doc, len(doc))

    for itn in range(20):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            # Build the Doc for THIS text first, so that the debug output and
            # the GoldParse below both describe the current example.
            doc = nlp.make_doc(raw_text)
            print(doc, "token length", len(doc), "length of entity", len(entity_offsets), id(doc))
            print(entity_offsets)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.tagger(doc)
            nlp.entity.update(doc, gold)
    nlp.end_training()
    if output_dir:
        nlp.save_to_directory(output_dir)



def main(model_name, output_directory=None):
    """Load a model, register the 'technology' label and train on offset data.

    Args:
        model_name: Name handed to ``spacy.load``.
        output_directory: Optional location for the trained model. When given
            it is wrapped in ``pathlib.Path`` before being passed on; when
            ``None`` it is passed on unchanged.
    """
    pipeline = spacy.load(model_name)
    target_dir = None if output_directory is None else Path(output_directory)

    # Each example pairs a text with (start_char, end_char, label) entity spans.
    examples = [
        (
            'Play a key role in developing high - performance , scalable and fault - tolerant applications written in Java ',
            [(105, 109, 'technology')],
        ),
        (
            'Focus around building custom APIs for a variety of connected devises ',
            [(29, 33, 'technology')],
        ),
    ]

    pipeline.entity.add_label('technology')
    train_ner(pipeline, examples, target_dir)


if __name__ == '__main__':
    # Script entry point: load the 'en' model and save the trained result
    # under "example_model".
    output_directory = "example_model"
    main('en', output_directory=output_directory)