 import warnings
 import contextlib
 import multiprocessing
+from sentence_transformers import CrossEncoder
+from pathlib import Path
+from nltk import sent_tokenize
+import nltk
+import os
 
 from typing import (
     Any,
 from ._logger import set_verbose
 from ._utils import suppress_stdout_stderr
 
+# LlamaX: minimal holder for a model path; returns the stored path on request.
+class LlamaX:
+    def __init__(self, model_path: str):
+        self.model_path = model_path
+
+    def get_model_path(self) -> str:
+        return self.model_path
+
+# NLTK data loader. Requires nltk to be installed.
+def nlLoader(nltkData):
+    nltk_data_dir = Path(nltkData)
+    nltk.data.path.append(str(nltk_data_dir))
+
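+# Note: sent_tokenize below relies on the NLTK "punkt" tokenizer models being
+# found on nltk.data.path. If they are not already under the directory passed
+# to nlLoader, a one-time nltk.download('punkt') fetches them.
+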
+# Sentence-splitter function: read every .txt file from the Data directory and
+# split each document into paragraphs, then each paragraph into sentences.
+def sentSplit():
+    username = os.getenv('USERNAME')
+    path = f'C:\\Users\\{username}\\Documents\\Data'
+    if not os.path.exists(path):
+        print('Creating a new directory called `Data` in your Documents directory. '
+              'Please ensure that you add your RAG ingested data to the directory '
+              'as a .txt file. Thank you!')
+        os.mkdir(path)
+
+    context_dir = path
+    paragraphs = []
+    try:
+        for filename in os.listdir(context_dir):
+            if not filename.endswith('.txt'):
+                continue
+            file_path = os.path.join(context_dir, filename)
+            with open(file_path, encoding='utf-8') as file:
+                document = file.read()
+            # Split the document into paragraphs and every paragraph into sentences
+            for paragraph in document.replace("\r\n", "\n").split("\n\n"):
+                if len(paragraph.strip()) > 0:
+                    paragraphs.append(sent_tokenize(paragraph.strip()))
+        return paragraphs
+    except Exception as e:
+        print('''
+        IMPORTANT NOTICE:
+        Please add your text dataset to the Data directory before continuing. Thank you!''')
+        print(e)
+
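+# Example (sketch): for a .txt file containing "A. B.\n\nC.", sentSplit()
+# returns [['A.', 'B.'], ['C.']], i.e. one sentence list per paragraph.
+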
+# Passage builder: slide a window over each paragraph's sentences.
+# The window size may be adjusted.
+def passSearch():
+    window_size = 3
+    passages = []
+    for paragraph in sentSplit() or []:  # sentSplit() returns None on failure
+        for start_idx in range(0, len(paragraph), window_size):
+            end_idx = min(start_idx + window_size, len(paragraph))
+            passages.append(" ".join(paragraph[start_idx:end_idx]))
+    return passages
+
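+# Example (sketch): a paragraph of five sentences with window_size = 3 yields
+# two passages, " ".join(sentences[0:3]) and " ".join(sentences[3:5]).
+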
+# Search for each query and predict scores for the [query, passage] pairs
+def searchQuery(question, model_path: str):
+    query = [question]
+    docs = []
+
+    for que in query:
+        try:
+            # Load the cross-encoder re-ranker from the given path or model name
+            model = CrossEncoder(model_path)
+            # Pair the query with every passage and predict a score for each pair
+            model_inputs = [[que, passage] for passage in passSearch()]
+            scores = model.predict(model_inputs)
+
+            # Sort the pairs by score in decreasing order
+            results = [{"input": inp, "score": score} for inp, score in zip(model_inputs, scores)]
+            results = sorted(results, key=lambda x: x["score"], reverse=True)
+            for hit in results[:1]:
+                print("")
+                docs.append(hit["input"][1])
+                print("Performing in-document search...")
+                print("")
+            return docs
+        except Exception as e:
+            print("Oh no! It seems that you have not added your text dataset to the Data directory.")
+            print(e)
+
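+# Example usage (sketch; the checkpoint name is an assumption, any
+# sentence-transformers cross-encoder works):
+#   nlLoader(r'C:\Users\<you>\nltk_data')
+#   docs = searchQuery("What is llama.cpp?",
+#                      "cross-encoder/ms-marco-MiniLM-L-6-v2")
+#   print(docs[0] if docs else "No matching passage found.")
+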
 
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""