instructlab · anik120 · Apr 4, 2024 · Apr 4, 2024
diff --git a/cli/generator/generate_data.py b/cli/generator/generate_data.py
@@ -20,7 +20,6 @@
 import yaml

 # Local
-from .. import config
 from ..utils import chunk_document, get_documents, get_taxonomy_diff
 from . import utils

@@ -339,6 +338,7 @@ def generate_data(
    console_output=True,
    api_key: Optional[str] = None,
    chunk_word_count=None,
+    server_ctx_size=None,
 ):
    seed_instruction_data = []
    generate_start = time.time()
@@ -364,20 +364,17 @@ def generate_data(
    def unescape(s):
        return bytes(s, "utf-8").decode("utf-8")

-    placeholder = seed_instruction_data[0]["document"]
-    if placeholder:
-        documents = chunk_document(
-            documents=placeholder,
-            max_context_size=config.MAX_CONTEXT_SIZE,
-            chunk_word_count=chunk_word_count,
-        )
-
    test_data = []
    for seed_example in seed_instruction_data:
        user = seed_example["instruction"]

-        if placeholder:
-            seed_example["document"] = documents
+        documents = seed_example["document"]
+        if documents:
+            seed_example["document"] = chunk_document(
+                documents=documents,
+                server_ctx_size=server_ctx_size,
+                chunk_word_count=chunk_word_count,
+            )

        if len(seed_example["input"]) > 0:
            user += "\n" + seed_example["input"]
@@ -597,12 +594,6 @@ def read_taxonomy_file(logger, file_path, yaml_rules: Optional[str] = None):
                documents = get_documents(documents)
                logger.info("Content from git repo fetched")

-                # cfg = config.get_default_config()
-                # documents = chunk_document(
-                #     documents=documents,
-                #     max_context_size=cfg.serve.max_ctx_size,
-                #     chunk_word_count=chunk_word_count,
-                # )
            for t in get_seed_examples(contents):
                q = t["question"]
                a = t["answer"]

diff --git a/cli/lab.py b/cli/lab.py
@@ -388,6 +388,13 @@ def serve(ctx, model_path, gpu_layers, num_threads, max_ctx_size):
    show_default=True,
    help="Rules file for YAML linting",
 )
+@click.option(
+    "--server-ctx-size",
+    type=click.INT,
+    default=config.MAX_CONTEXT_SIZE,
+    show_default=True,
+    help="The context size is the maximum number of tokens the server will consider.",
+)
 @click.pass_context
 def generate(
    ctx,
@@ -403,6 +410,7 @@ def generate(
    api_key,
    yaml_rules,
    chunk_word_count,
+    server_ctx_size,
 ):
    """Generates synthetic data to enhance your example data"""
    # pylint: disable=C0415
@@ -450,6 +458,7 @@ def generate(
            console_output=not quiet,
            yaml_rules=yaml_rules,
            chunk_word_count=chunk_word_count,
+            server_ctx_size=server_ctx_size,
        )
    except GenerateException as exc:
        click.secho(

diff --git a/cli/utils.py b/cli/utils.py
@@ -216,33 +216,35 @@ def get_documents(input_pattern: Dict[str, Union[str, List[str]]]) -> List[str]:
            shutil.rmtree(temp_dir)


-def chunk_document(documents: List, max_context_size, chunk_word_count) -> List[str]:
+def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
    """
    Iterates over the documents and splits them into chunks based on the word count provided by the user.
    Args:
-        document (dict): List of documents retrieved from git (can also consist of a single document)
-        max_context_size (int): Defaults to 4096
+        documents (dict): List of documents retrieved from git (can also consist of a single document).
+        server_ctx_size (int): Context window size of server.
        chunk_word_count (int): Maximum number of words to chunk a document.
    Returns:
         List[str]: List of chunked documents.
    """
-    token_size = int(chunk_word_count * 1.3)  # 1 word ~ 1.3 token
-    content = []
-    if token_size < int(max_context_size - 1024):
-        text_splitter = RecursiveCharacterTextSplitter(
-            separators=["\n\n", "\n"],
-            chunk_size=int(token_size * 4),  # 1 token ~ 4 English character
-            chunk_overlap=100,
-        )
-
-        for docs in documents:
-            temp = text_splitter.create_documents([docs])
-            content.extend([item.page_content for item in temp])
-
-    else:
+    no_tokens_per_doc = int(chunk_word_count * 1.3)  # 1 word ~ 1.3 token
+    if no_tokens_per_doc > int(server_ctx_size - 1024):
        logger.error(
-            "Error: Given word count exceeds the required chunk limit i.e. 2400"
+            "Error: {}".format(
+                str(
+                    f"Given word count per doc will exceed the server context window size {server_ctx_size}"
+                )
+            )
        )
        sys.exit()
+    content = []
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n\n", "\n"],
+        chunk_size=int(no_tokens_per_doc * 4),  # 1 token ~ 4 English character
+        chunk_overlap=100,
+    )
+
+    for docs in documents:
+        temp = text_splitter.create_documents([docs])
+        content.extend([item.page_content for item in temp])

    return content
diff --git a/tests/test_lab_generate.py b/tests/test_lab_generate.py
@@ -179,6 +179,7 @@ def test_OpenAI_server_error(self, get_instructions_from_model):
                    rouge_threshold=0.9,
                    console_output=True,
                    chunk_word_count=1000,
+                    server_ctx_size=4096,
                )
            self.assertIn(
                "There was a problem connecting to the OpenAI server",
@@ -210,6 +211,7 @@ def test_no_error(self, get_instructions_from_model):
                rouge_threshold=0.9,
                console_output=True,
                chunk_word_count=1000,
+                server_ctx_size=4096,
            )
            get_instructions_from_model.assert_called_once()
            expected_files = [