diff --git a/.gitignore b/.gitignore index 6294371..c08061c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ .idea __pycache__ - +.DS_Store diff --git a/01_basic/README.md b/01_basic/README.md index ffd211b..54419b6 100644 --- a/01_basic/README.md +++ b/01_basic/README.md @@ -80,7 +80,32 @@ [**40. 字典转成对象(骚操作)**](#字典转成对象) +[**41. lgb[gpu版本]和xgb[gpu版本]安装**](#boost安装) + +[**42. tqdm**](#tqdm) + +[**43. joblib Parallel并行**](#joblib_parallel) + +[**44. 调试神器pysnooper - 丢弃print**](#调试神器pysnooper) + +[**45. 调试神器debugpy**](#调试神器debugpy) + +[**46. 分组计算均值并填充**](#分组计算均值并填充) + +[**47. python日期处理**](#python日期处理) + +[**48. dataclass**](#dataclass) + +[**49. md5 sha256**](#md5_sha256) + +[**50. 查看内存**](#查看内存) + +[**51. __slots__用法**](#slots用法) + --- +
+点击展开 + ```python %reload_ext autoreload %autoreload 2 @@ -321,6 +346,10 @@ data[data.msg_from.apply(len)==10] ``` ### re模块 + +[常用正则表达式速查手册,Python文本处理必备](https://mp.weixin.qq.com/s/ySsgcrSnkguO2c8D-SQNxw)
+[regexlearn](https://github.com/aykutkardas/regexlearn.com)
+ ```python # 1. 将一个问题中的网址、邮箱、手机号、身份证、日期、价格提出来 @@ -370,7 +399,104 @@ s = '22基本日常生活活动:指食物摄取、大小便始末、穿脱衣 re.findall(r'^\d+(?![\d*小时*]|[\d*种*])[\u4e00-\u9fa5]+', s) # 匹配只留下中文、英文和数字 -re.sub(r'[^\u4E00-\u9FA5\s0-9a-zA-Z]+', '', s)- +re.sub(r'[^\u4E00-\u9FA5\s0-9a-zA-Z]+', '', s) + +# 日期解析202206 +import cn2an #version 0.5.14 +import datetime +import re +def getYearMonth(s): + ''' + 【格式说明】 + 今年上个月/上月/前一个月/前个月 -> 202204 + 今年当月/该月/这月/这个月/本月 -> 202205 + 去年5月/去年五月/2021年五月/2021五月/二零二一五月/二零二一 五月 -> 202105 + 前年5月/前年五月/2020年五月/2020五月/二零二零五月/二零二零 五月 -> 202005 + 2021年7月/二零二一年7月 -> 202107 + 5月/五月份 -> 202205 + 2021.6/2021.06/2021-6/2021-06/2021 - 6月/2021 ---6月/2021 . 6月/2021...6月, -> 202106 + 2021 4月/2021 04 -> 202104 + 如果没有提到时间 -> 202205(默认今年当月) + 如果输入的时间有误或月份有误比如输入2021 23, -> 202205(默认今年当月) + 如果输入时间超过当前时间 -> 202205(默认今年当月) + 如果输入时间早于2020年1月 -> 202205(默认今年当月) + ''' + cur_date = datetime.datetime.now().strftime('%Y%m') + try: + DATE_REG1 = '(?:[一二三四五六七八九零十0-9]{1,4}年[一二三四五六七八九零十0-9]{1,2}月)|(?:去年[一二三四五六七八九零十0-9]+月)|(?:前年[一二三四五六七八九零十0-9]+月)|(?:[一二三四五六七八九零十0-9]+年[一二三四五六七八九零十0-9]+月)|(?:[一二三四五六七八九零十0-9]{1,2}月)|(?:[一二三四五六七八九零十0-9]+年)|(?:[一二三四五六七八九零十0-9]+月)' + thism_lst = ['当月', '该月', '这个月', '本月'] + lastm_lst = ['上月', '上个月', '前一个月', '前个月'] + date = '' + def helper(s, pattern): + date = '' + s = cn2an.transform(s, "cn2an") # 转换成阿拉伯数字 + res = re.findall(pattern, s) + if res: + res = res[0] # 如果有多个就取第一个 + year = '2022' #需要人工维护当年,还有过去两年的一个判断;每年要手动更新这部分 + if '去年' in res or '21年' in res: + year = '2021' + elif '前年' in res or '20年' in res: + year = '2020' + month = re.findall('(?:([0-9]+)月)', res) + if month: + month = int(month[0]) + if month > 0 and month < 13: + if month < 10: + month = '0' + str(month) + else: + month = str(month) + else: + return '' + date = year + month + else: + date = year + str(datetime.datetime.now().month) + return date + six_d = re.findall(r'2\d{5}', s) #直接识别6位日期比如202110 + if six_d: + date = six_d[0] + if not date: + # 针对2021 4月/2021.6/2021.06/2021-6/2021-06/2021 - 6月/2021 ---6月/2021 . 
6月/2021...6月这些情况 + DATE_REG3 = r'(?:\d{4}\s*\.+\s*\d{1,2})|(?:\d{4}\s*-+\s*\d{1,2})|(?:\d{4}\s*_+\s*\d{1,2})|(?:\d{4}\s+\d{1,2})' + six_d2 = re.findall(DATE_REG3, s) + if six_d2: + _six_d2 = six_d2[0] + try: + int(_six_d2[-2]) + _six_d2_m = _six_d2[-2:] + except: + _six_d2_m = _six_d2[-1] + s = _six_d2[:4]+'年'+_six_d2_m+'月' + s = s.replace(' ', '') + if not date: + for i in thism_lst: + if i in s: + date = cur_date + break + if not date: + for i in lastm_lst: + if i in s: + date = (datetime.datetime.now() - datetime.timedelta(days=30, hours=23)).strftime('%Y%m') + break + if not date: + # 判断2021五月这种情况 + DATE_REG2 = '(?:[一二三四五六七八九零十0-9]{4}[一二三四五六七八九零十]{1,2}月)' + res = re.findall(DATE_REG2, s) + if res: + s = res[0][:4]+'年'+res[0][4:] + date = helper(s, DATE_REG1) + else: + date = '' + if not date: + date = helper(s, DATE_REG1) + if not date: + date = cur_date + #corner case再判断下,处理下边界问题 + if date < '202001' or date[-2:] > '12': + date = cur_date + except: + date = cur_date + return date ``` ### eval @@ -444,6 +570,81 @@ tasks.append(executor.submit(func2, param1, param2)) wait(tasks, return_when='ALL_COMPLETED') res1, res2 = (x.result() for x in tasks) ``` +```python +# 多进程优化版(推荐用这个) +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import functools +from concurrent.futures import ProcessPoolExecutor +from tqdm import tqdm +import time + +class Pipe(object): + """I am very like a linux pipe""" + + def __init__(self, function): + self.function = function + functools.update_wrapper(self, function) + + def __ror__(self, other): + return self.function(other) + + def __call__(self, *args, **kwargs): + return Pipe( + lambda iterable, *args2, **kwargs2: self.function( + iterable, *args, *args2, **kwargs, **kwargs2 + ) + ) + +@Pipe +def xProcessPoolExecutor(iterable, func, max_workers=5, desc="Processing", unit="it"): + if max_workers > 1: + total = len(iterable) if hasattr(iterable, '__len__') else None + + with ProcessPoolExecutor(max_workers) as pool, tqdm(total=total, desc=desc, unit=unit) as pbar: + for i in pool.map(func, iterable): + yield i + pbar.update() + + else: + return map(func, iterable) + +xtuple, xlist, xset = Pipe(tuple), Pipe(list), Pipe(set) + +def ff(x): + for i in range(x): + a = 1 + return x+2 + +if __name__ == '__main__': + dfs = [] + arr = [100000000,200000000,300000000,400000000] + #without multiprocess + for i in arr: + dfs.append(ff(i)) + #with multiprocess + dfs = arr | xProcessPoolExecutor(ff, 16) | xlist #这里的16是进程数,一般cpu有N核就起N-1个进程 + print(dfs) +``` +```python +# 多进程(yuanjie封装meutils) 以多进程读取data下pdf文件为例 +from meutils.pipe import * +os.environ['LOG_PATH'] = 'pdf.log' +from meutils.log_utils import * +location = 'output' #pdf文件处理后保存的文件夹 +@diskcache(location=location) +def func(file_path): + try: + df = pdf_layout(str(file_path)) #解析成字典 详见https://github.com/binzhouchn/deep_learning/blob/master/4_llm/1_%E5%90%91%E9%87%8F%E6%95%B0%E6%8D%AE%E5%BA%93/es/es.py 中的body字典 + with open(f'{location}/{file_path.stem}.txt', 'w', encoding='utf8') as f: + json.dump(df, f, ensure_ascii=False) + except Exception as e: + logger.debug(f"{file_path}: {e}") + logger.debug(f"{file_path}: {traceback.format_exc().strip()}") +if __name__ == '__main__': + ps = Path('./data/').glob('*.pdf') | xlist #将所有pdf文件都列出来 + dfs = ps | xProcessPoolExecutor(func, 16) | xlist #这里的16是进程数,一般cpu有N核就起N-1个进程 +``` ### cv的多进程实现 @@ -593,6 +794,21 @@ import pandas as pd pd.get_dummies(data.categ_id) ``` +方法三
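+一种可行的写法(示意):用 sklearn 的 OneHotEncoder,其中 data、categ_id 列沿用上文,属假设:
+```python
+# 示意代码:sparse_output 参数为 sklearn>=1.2 的写法,老版本用 sparse=False
+from sklearn.preprocessing import OneHotEncoder
+import pandas as pd
+
+enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+onehot = enc.fit_transform(data[['categ_id']])  # data 为上文的 DataFrame
+pd.DataFrame(onehot, columns=enc.get_feature_names_out(['categ_id']))
+```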
@@ -716,7 +932,7 @@ df = df.merge(df_aggr, how='left', on='personid').fillna(0) ``` ### python画图显示中文 -f + ```python ## 显示中文解决方法 # 解决方法一 @@ -827,6 +1043,13 @@ sorted(l, key=lambda x:x[1], reverse=True) # Out[42]: [('c', 6), ('d', 4), ('e', 3), ('b', 2), ('a', 1)] ``` +用法一(衍生):
```python # 调整数组顺序使奇数位于偶数前面,奇偶相对顺序不变 @@ -1036,12 +1259,303 @@ def dict_to_object(_d): return inst ``` +### boost安装 + +```shell +sudo apt-get install libboost-all-dev +sudo apt install ocl-icd-opencl-dev +sudo apt install cmake(可以去https://cmake.org/files下载比如cmake-3.14.0.tar.gz然后执行./bootstrap然后make然后make install) +``` + +lgb gpu版安装
+```shell +pip install --upgrade pip +pip install lightgbm --install-option=--gpu +``` +xgb gpu版安装
+```shell +git clone --recursive https://github.com/dmlc/xgboost +cd xgboost +mkdir build +cd build +cmake .. -DUSE_CUDA=ON +make(或者make -j4可能或报错) + +cd .. +cd python-package +python setup.py install +``` + +### tqdm + +[当Pytorch遇上tqdm](https://blog.csdn.net/dreaming_coder/article/details/113486645)
+```python +for epoch in range(epoch): + with tqdm( + iterable=train_loader, + bar_format='{desc} {n_fmt:>4s}/{total_fmt:<4s} {percentage:3.0f}%|{bar}| {postfix}', + ) as t: + start_time = datetime.now() + loss_list = [] + for batch, data in enumerate(train_loader): + t.set_description_str(f"\33[36m【Epoch {epoch + 1:04d}】") + # 训练代码 + time.sleep(1) + # 计算当前损失 + loss = random() + loss_list.append(loss) + cur_time = datetime.now() + delta_time = cur_time - start_time + t.set_postfix_str(f"train_loss={sum(loss_list) / len(loss_list):.6f}, 执行时长:{delta_time}\33[0m") + t.update() +``` + +### joblib_parallel + + +```python +#Parallel for loop 此方法可用于多个文件数据并行读取 +from joblib import Parallel, delayed +from math import sqrt +def ff(num): + return [sqrt(n ** 3) for n in range(num)] +#不使用并行 7.5s +res = [] +for i in range(10,7000): + res.append(ff(i)) +#使用并行 2.75s +res = Parallel(n_jobs = -1, verbose = 1)(delayed(ff)(i) for i in range(10,7000)) +``` + +### 调试神器pysnooper + +```python +#pip install pysnooper +import os +os.environ['pysnooper'] = '1' # 开关 + +from pysnooper import snoop +#如果为0,则重新定义snoop然后这个修饰啥都不干 +if os.environ['pysnooper'] == '0': + import wrapt + def snoop(*args, **kwargs): + @wrapt.decorator + def wrapper(wrapped, instance, args, kwargs): + return wrapped(*args, **kwargs) + return wrapper +``` + +### 调试神器debugpy + +安装:pip install debugpy -U
+在python代码的最前面加上这段代码:
+```python +import debugpy +try: + # 5678 is the default attach port in the VS Code debug configurations. Unless a host and port are specified, host defaults to 127.0.0.1 + debugpy.listen(("localhost", 9501)) + print("Waiting for debugger attach") + debugpy.wait_for_client() +except Exception as e: + pass + +``` + +在vscode软件中项目下新建一个.vscode目录,然后创建launch.json,看9501端口那个配置
+```python +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "torchr_ex2", + "type": "python", + "request": "launch", + "program": "/Users/zb/anaconda3/envs/rag/bin/torchrun", + "console": "integratedTerminal", + "justMyCode": true, + "args": [ + "--nnodes", + "1", + "--nproc-per-node", + "2", + "${file}", + "--model_name_or_path", + "my_model_bz" + ] + }, + { + "name": "sh_file_debug", + "type": "debugpy", + "request": "attach", + "connect": { + "host": "localhost", + "port": 9501 + } + }, + ] +} +``` + +上面的端口号都写一样比如9501,别搞错了! + +### 分组计算均值并填充 + +```python +def pad_mean_by_group(df, gp_col='stock_id'): + # 只留下需要处理的列 + cols = [col for col in df.columns if col not in["stock_id", "time_id", "target", "row_id"]] + # 查询nan的列 + df_na = df[cols].isna() + # 根据分组计算平均值 + df_mean = df.groupby(gp_col)[cols].mean() + + # 依次处理每一列 + for col in cols: + na_series = df_na[col] + names = list(df.loc[na_series,gp_col]) + + t = df_mean.loc[names,col] + t.index = df.loc[na_series,col].index + + # 相同的index进行赋值 + df.loc[na_series,col] = t + return df +train_pca = pad_mean_by_group(train_pca) +``` + +### python日期处理 + +[80个例子,彻底掌握Python日期时间处理](https://mp.weixin.qq.com/s/2bJUZBfWS_8ULGrb9tRpmw)
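+上面链接覆盖得比较全,这里放几个最常用操作的小示例(示意):
+```python
+import datetime
+
+now = datetime.datetime.now()
+# 格式化输出
+print(now.strftime('%Y-%m-%d %H:%M:%S'))
+# 字符串解析成日期
+d = datetime.datetime.strptime('2022-05-01', '%Y-%m-%d')
+# 日期加减
+print(d + datetime.timedelta(days=7))   # 2022-05-08 00:00:00
+# 相差天数
+print((now - d).days)
+```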
+ +### dataclass + +dataclass 提供一个简便的方式创建数据类,默认实现 `__init__()`、`__repr__()`、`__eq__()` 方法
+dataclass支持数据类型的嵌套
+支持将数据设置为不可变:@dataclass(frozen=True),嵌套和 frozen 的用法见下面的示例
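+嵌套和 frozen=True 的一个小示例(示意):
+```python
+from dataclasses import dataclass
+
+@dataclass(frozen=True)
+class Point:
+    x: int
+    y: int
+
+@dataclass
+class Line:
+    start: Point  # 字段本身也是dataclass,即嵌套
+    end: Point
+
+p = Point(1, 2)
+# p.x = 3  # frozen=True时赋值会抛 dataclasses.FrozenInstanceError
+print(Line(Point(0, 0), p))
+# Line(start=Point(x=0, y=0), end=Point(x=1, y=2))
+```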
+ +不用dataclass的写法:
+ +```python +class Person: + def __init__(self, name, age): + self.name = name + self.age = age +p = Person('test', 18) +q = Person('test', 18) +#<__main__.Person at 0x7ff4ade66f40> +str(p) +repr(p) +#'<__main__.Person object at 0x7ff4ade66f40>' +p == q +#False +``` +```python +from typing import Any +from dataclasses import dataclass +@dataclass +class Person: + name: Any + age: Any = 18 +p = Person('test', 18) +q = Person('test', 18) +#Person(name='test', age=18) +str(p) +repr(p) +#"Person(name='test', age=18)" +p == q +#True +``` + +### md5_sha256 + +```python +import hashlib + +def enc(s, ed='md5'): + if ed == 'md5': + hash_object = hashlib.md5(s.encode()) + elif ed == 'sha256': + hash_object = hashlib.sha256(s.encode()) + else: + raise ValueError('unsupport type!') + hash_hex = hash_object.hexdigest() + return hash_hex + +for i in ['13730973320','13802198853','17619520726']: + print(enc(i,'md5')) +``` + +### 查看内存 + +有几种方法可以在Python中获取对象的大小。可以使用sys.getsizeof()来获取对象的确切大小,使用objgraph.show_refs()来可视化对象的结构,或者使用psutil.Process().memory_info()。RSS获取当前分配的所有内存。 + +```python +>>> import numpy as np +>>> import sys +>>> import objgraph +>>> import psutil +>>> import pandas as pd + +>>> ob = np.ones((1024, 1024, 1024, 3), dtype=np.uint8) + +### Check object 'ob' size +>>> sys.getsizeof(ob) / (1024 * 1024) +3072.0001373291016 + +### Check current memory usage of whole process (include ob and installed packages, ...) +>>> psutil.Process().memory_info().rss / (1024 * 1024) +3234.19140625 + +### Check structure of 'ob' (Useful for class object) +>>> objgraph.show_refs([ob], filename='sample-graph.png') + +### Check memory for pandas.DataFrame +>>> from sklearn.datasets import load_boston +>>> data = load_boston() +>>> data = pd.DataFrame(data['data']) +>>> print(data.info(verbose=False, memory_usage='deep')) + +RangeIndex: 506 entries, 0 to 505 +Columns: 13 entries, 0 to 12 +dtypes: float64(13) +memory usage: 51.5 KB + +### Check memory for pandas.Series +>>> data[0].memory_usage(deep=True) # deep=True to include all the memory used by underlying parts that construct the pd.Series +4176 +``` + +### slots用法 + +```python +#不使用__slots__时,可以很容易地添加一个额外的job属性 +class Author: + def __init__(self, name, age): + self.name = name + self.age = age + + me = Author('Yang Zhou', 30) + me.job = 'Software Engineer' + print(me.job) + # Software Engineer + +# 在大多数情况下,我们不需要在运行时更改实例的变量或方法,并且__dict__不会(也不应该)在类定义后更改。所以Python为此提供了一个属性:__slots__ +class Author: + __slots__ = ('name', 'age') + + def __init__(self, name, age): + self.name = name + self.age = age + + me = Author('Yang Zhou', 30) + me.job = 'Software Engineer' + print(me.job) + # AttributeError: 'Author' object has no attribute 'job' +``` -### -working on bert -working on bert 单句分类 -working on bert NER -working on bert 两句输入分类或其他 -working on 多分类任务 +
\ No newline at end of file diff --git a/01_basic/arg_test.py b/01_basic/arg_test.py index 0e65c6b..85f8a82 100644 --- a/01_basic/arg_test.py +++ b/01_basic/arg_test.py @@ -28,3 +28,4 @@ print(opt.dev_path) print('done.') + diff --git a/03_pandas/README.md b/03_pandas/README.md index c8be243..f1c7f58 100644 --- a/03_pandas/README.md +++ b/03_pandas/README.md @@ -1,5 +1,9 @@ ## 目录 +[pandas进阶修炼300题](https://www.heywhale.com/mw/project/6146c0318447b8001769ff20)
+ +[可以替代pandas比较好用的数据平行处理包](#数据平行处理) + [**1. pandas并行包**](#pandas并行包) [**2. pandas dataframe手动创建**](#pandas_dataframe手动创建) @@ -40,8 +44,27 @@ [**20. dataframe表格填充**](#dataframe表格填充) +[**21. 加快dataframe读取**](#加快dataframe读取) + +[**22. df热力图**](#df热力图) + +[**23. df热力地图**](#df热力地图) + +[**24. 2个pandas EDA插件**](#eda插件) + +[**25. python批量插入mysql数据库**](#python批量插入mysql数据库) + --- +### 数据平行处理 + +[polar]
+[polars 快速上手文档](https://pola-rs.github.io/polars-book/user-guide/quickstart/intro.html)
+[polars py-polars API 参考](https://pola-rs.github.io/polars/py-polars/html/reference)
+ +[pandarallel](https://nalepae.github.io/pandarallel/)
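+两个包的最小用法示意(文件名、列名均为假设):
+```python
+import polars as pl
+# polars:底层Rust+多线程,API和pandas类似
+df = pl.read_csv('train.csv')
+out = df.group_by('id').agg(pl.col('v').mean())  # 旧版本方法名是groupby
+print(out.to_pandas())
+
+# pandarallel:给pandas补上并行apply
+import pandas as pd
+from pandarallel import pandarallel
+pandarallel.initialize(progress_bar=False)
+pdf = pd.read_csv('train.csv')
+pdf['v2'] = pdf['v'].parallel_apply(lambda x: x * 2)
+```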
+ + ### pandas_dataframe手动创建 手动创建dataframe @@ -321,4 +344,88 @@ df.loc[df.content_id=='x6mbO2rHfU3hTej4','sentiment_tmp'] = 1 df.fillna(method='ffill', axis=1).fillna(method='ffill') ``` +### 加快dataframe读取 + +方式一:cpu多线程读取(推荐)
+```python +#安装datatable==0.11.1 +import datatable as dtable +train = dtable.fread(path+'train.csv').to_pandas() +``` +方式二:gpu读取
+```python +#安装cudf(稍微有点麻烦) +import cudf +train = cudf.read_csv(path+'train.csv').to_pandas() +``` + +### df热力图 + +```python +df.corr().style.background_gradient(cmap='coolwarm').set_precision(2) +``` + +### df热力地图 + +结合pyecharts将各省市高校上榜数量进行地图可视化
+```python +from pyecharts import options as opts +from pyecharts.charts import Map +#省份 +list1 = ['北京','江苏','上海','广东','湖北','陕西','浙江','四川','湖南','山东','安徽','辽宁','重庆','福建','天津','吉林','河南','黑龙江','江西','甘肃','云南','河北'] +#省份对应的高效数量 +list2 = [18, 15, 10, 9, 7, 7, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1] +c = ( + Map() + .add('', [list(z) for z in zip(list1,list2)], "china",is_map_symbol_show=False) + .set_global_opts( + title_opts=opts.TitleOpts(title="排名前100高校各省市占比"), + visualmap_opts=opts.VisualMapOpts(max_=20), + + + ) +) +c.render_notebook() +``` + +### eda插件 + +```python +#插件一 +#!pip install pandas_profiling +import pandas_profiling +pandas_profiling.ProfileReport(df) +#插件二 +import sweetviz as sv +report = sv.analyze(df) +report.show_html() +``` + +### python批量插入mysql数据库 + +```python +df.to_numpy()[:5].tolist() +''' +[['25_B', 25, 'B', 0.6, '2024-08-12'], + ['23_C', 23, 'C', 2.2, '2024-08-12'], + ['24_D', 24, 'D', 3.8, '2024-08-12'], + ['29_E', 29, 'E', 1.5, '2024-08-12'], + ['22_F', 22, 'F', 4.1, '2024-08-12']] +''' + +import pymysql +MYSQL_W_CONFIG = {'host':'10.xx.xxx.xx', + 'port':3306, + 'user':'user', + 'password':'passwd', + 'database':'mydatabase', + 'charset':'utf8'} +conn = pymysql.connect(autocommit=True, **MYSQL_W_CONFIG) +cursor = conn.cursor() +sql = "insert into xx_table(id,cust_id,agcode,score,s_time) values(%s,%s,%s,%s,%s)" +cursor.executemany(sql, df_final.to_numpy().tolist()) +conn.commit() +conn.close() +#1w条数据批量插入大概0.45s左右 +``` \ No newline at end of file diff --git a/07_database/README.md b/07_database/README.md index 11f962f..92f8bd9 100644 --- a/07_database/README.md +++ b/07_database/README.md @@ -5,7 +5,7 @@ # 先下载镜像 docker pull mysql:5.5 # 运行容器 可以先把-v去掉 -docker run -p 3306:3306 --name mymysql -v $PWD/conf:/etc/mysql/conf.d -v $PWD/logs:/logs -v $PWD/data:/var/lib/mysql -e MYSQL_ROOT_PASSWORD=123456 -d mysql:5.5 +docker run -p 3306:3306 --name mymysql -v $PWD/conf:/etc/mysql/conf.d -v $PWD/logs:/logs -v $PWD/mysql_data:/var/lib/mysql -e MYSQL_ROOT_PASSWORD=123456 -d mysql:5.5 -p 3306:3306:将容器的 3306 端口映射到主机的 3306 端口。 -v -v $PWD/conf:/etc/mysql/conf.d:将主机当前目录下的 conf/my.cnf 挂载到容器的 /etc/mysql/my.cnf。 @@ -32,9 +32,11 @@ db.commit() ``` ## 2. Redis(docker version) + +![redis](imgs/redis_pic.png) ``` # 启动redis命令 -docker run --name docker-redis-test -p 6379:6379 -d redis:latest --requirepass "123456" +docker run --name docker-redis-test -p 6379:6379 -v $PWD/redis_data:/data -d redis:latest --requirepass "123456" # redis客户端连接命令 docker exec -it redis-cli # 进去以后的操作 diff --git a/07_database/es.md b/07_database/es.md new file mode 100644 index 0000000..a4a00cd --- /dev/null +++ b/07_database/es.md @@ -0,0 +1,150 @@ +# es存入768维度向量,以及向量查询(ES版本需要7.3之后) + +https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/semantic-search/semantic_search_quora_elasticsearch.py + + +```python +""" +This script contains an example how to perform semantic search with ElasticSearch. + +As dataset, we use the Quora Duplicate Questions dataset, which contains about 500k questions: +https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs + +Questions are indexed to ElasticSearch together with their respective sentence +embeddings. + +The script shows results from BM25 as well as from semantic search with +cosine similarity. + +You need ElasticSearch (https://www.elastic.co/de/elasticsearch/) up and running. 
Further, you need the Python +ElasticSearch Client installed: https://elasticsearch-py.readthedocs.io/en/master/ + +As embeddings model, we use the SBERT model 'quora-distilbert-multilingual', +that it aligned for 100 languages. I.e., you can type in a question in various languages and it will +return the closest questions in the corpus (questions in the corpus are mainly in English). +""" + +from sentence_transformers import SentenceTransformer, util +import os +from elasticsearch import Elasticsearch, helpers +import csv +import time +import tqdm.autonotebook + + + +es = Elasticsearch() + +model = SentenceTransformer('quora-distilbert-multilingual') + +url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv" +dataset_path = "quora_duplicate_questions.tsv" +max_corpus_size = 100000 + +#Download dataset if needed +if not os.path.exists(dataset_path): + print("Download dataset") + util.http_get(url, dataset_path) + +#Get all unique sentences from the file +all_questions = {} +with open(dataset_path, encoding='utf8') as fIn: + reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL) + for row in reader: + all_questions[row['qid1']] = row['question1'] + if len(all_questions) >= max_corpus_size: + break + + all_questions[row['qid2']] = row['question2'] + if len(all_questions) >= max_corpus_size: + break + +qids = list(all_questions.keys()) +questions = [all_questions[qid] for qid in qids] + +#Index data, if the index does not exists +if not es.indices.exists(index="quora"): + try: + es_index = { + "mappings": { + "properties": { + "question": { + "type": "text" + }, + "question_vector": { + "type": "dense_vector", + "dims": 768 + } + } + } + } + + es.indices.create(index='quora', body=es_index, ignore=[400]) + chunk_size = 500 + print("Index data (you can stop it by pressing Ctrl+C once):") + with tqdm.tqdm(total=len(qids)) as pbar: + for start_idx in range(0, len(qids), chunk_size): + end_idx = start_idx+chunk_size + + embeddings = model.encode(questions[start_idx:end_idx], show_progress_bar=False) + bulk_data = [] + for qid, question, embedding in zip(qids[start_idx:end_idx], questions[start_idx:end_idx], embeddings): + bulk_data.append({ + "_index": 'quora', + "_id": qid, + "_source": { + "question": question, + "question_vector": embedding + } + }) + + helpers.bulk(es, bulk_data) + pbar.update(chunk_size) + + except: + print("During index an exception occured. 
Continue\n\n") + + + + +#Interactive search queries +while True: + inp_question = input("Please enter a question: ") + + encode_start_time = time.time() + question_embedding = model.encode(inp_question) + encode_end_time = time.time() + + #Lexical search + bm25 = es.search(index="quora", body={"query": {"match": {"question": inp_question }}}) + + #Sematic search + sem_search = es.search(index="quora", body={ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "cosineSimilarity(params.queryVector, doc['question_vector']) + 1.0", + "params": { + "queryVector": question_embedding + } + } + } + } + }) + + print("Input question:", inp_question) + print("Computing the embedding took {:.3f} seconds, BM25 search took {:.3f} seconds, semantic search with ES took {:.3f} seconds".format(encode_end_time-encode_start_time, bm25['took']/1000, sem_search['took']/1000)) + + print("BM25 results:") + for hit in bm25['hits']['hits'][0:5]: + print("\t{}".format(hit['_source']['question'])) + + print("\nSemantic Search results:") + for hit in sem_search['hits']['hits'][0:5]: + print("\t{}".format(hit['_source']['question'])) + + print("\n\n========\n") +``` \ No newline at end of file diff --git a/07_database/faiss.md b/07_database/faiss.md new file mode 100644 index 0000000..8cecb90 --- /dev/null +++ b/07_database/faiss.md @@ -0,0 +1,11 @@ +# faiss向量搜索库 + +与es.md提到的es7.3向量搜索一样,faiss是更加专业的向量搜索工具 + +[实战入门faiss搜索bert最邻近句子:docker CPU镜像开箱即用,无需额外安装下载](https://mp.weixin.qq.com/s?__biz=MzA4NzkxNzM3Nw==&mid=2457484515&idx=1&sn=c13b27b09b4a7e2a31a1ee421b362540&chksm=87bc8acdb0cb03db46ca7cc0893e46d4078e925a3b35f717806315c0881f6ad75b2165df4a0f&cur_album_id=2002019450945896449&scene=189#wechat_redirect) + +[semantic_search_quora_faiss.py](https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/semantic-search/semantic_search_quora_faiss.py) + + + +## todo \ No newline at end of file diff --git a/07_database/imgs/redis_pic.png b/07_database/imgs/redis_pic.png new file mode 100644 index 0000000..06eb3ec Binary files /dev/null and b/07_database/imgs/redis_pic.png differ diff --git a/08_vscode/README.md b/08_vscode/README.md new file mode 100644 index 0000000..1118417 --- /dev/null +++ b/08_vscode/README.md @@ -0,0 +1,80 @@ +# vscode使用(版本1.86.2) + +## 1. 在VScode中添加远程Linux服务器中Docker容器中的Python解释器 + +**以dgx.6机器为例**
+```shell +# 第一步 创建容器 +nvidia-docker run -d --name myllm -p 8891:22 -v $PWD/llm:/workspace/llm -w /workspace/llm -it 10.xx.xx.xxx/zhoubin/llm:py311-cuda12.1.0-cudnn8-devel-ubuntu22.04 /bin/bash +注释: +[-p 8891:22]:把docker的端口号22映射到服务器的端口号8891。 +[-d]:容器后台运行,避免退出容器后容器自动关闭。 +[-v]:挂载和同步目录,服务器和docker内有一个文件夹保持同步。 +[-it]:确保docker后台交互运行。 +[10.xx.xx.xxx/zhoubin/llm:py311-cuda12.1.0-cudnn8-devel-ubuntu22.04]:镜像名。 +[/bin/bash]:docker内要运行的指令。 +``` +```shell +#第二步 在容器内安装ssh服务 +docker exec -it [容器ID] /bin/bash +# 更新apt-get +命令:apt-get update +# 安装vim +命令:apt-get install vim +# 安装openssh-server +命令:apt-get install openssh-server +# 设置root密码(docker里面的用户名和密码,我这边账号密码都是root/root) +命令:passwd +``` +```shell +# 第三步 配置/etc/ssh/sshd_config文件 +# 在文件/etc/ssh/sshd_config中添加下面的代码: +PubkeyAuthentication yes +PermitRootLogin yes + +# 第四步 重启ssh服务(好像每次停止容器后重启都需要运行下) +/etc/init.d/ssh restart +或 service ssh restart + +# 第五步 退出docker后,验证端口映射 +docker ps -a +docker port [容器ID] 22 +若结果输出“0.0.0.0:8891”,则说明端口映射正确。 +``` +```shell +# 第6步 本地电脑连接docker(见Termius dgx6_docker_llm) +ssh root@11.xx.xx.xxx -p 8891 ,密码是root +``` +```shell +# 使用VSCode连接远程主机上的docker container +# 打开VScode编辑器,按下快捷键“Ctrl+Shift+X”,查找安装“Remote Development”。安装完成后需要点击“reload”,然后按下快捷键“Ctrl+Shift+P”,输入“remote-ssh”,选择“open SSH Configuration file”,在文件xx/username/.ssh/config中添加如下内容: +Host llm_docker #Host随便起名字 + HostName 11.xxx.xx.x + User root + Port 8891 + +#保存后,按下快捷键"Ctrl+Shift+P",输入"remote-ssh",选择"Connect to Host...",然后点击"llm_docker",接着选择“Linux”,最后按提示输入第三步中设置的root连接密码,在左下角显示"SSH:llm_docker",说明已经成功连接docker。 +``` + +```shell +#内网环境远程如果出现连接不上,大概率是vscode-server无法下载导致,可以手动搞定 +https://update.code.visualstudio.com/commit:903b1e9d8990623e3d7da1df3d33db3e42d80eda/server-linux-x64/stable + +具体参考附录中的[VSCode连不上远程服务器] +``` + + +## 2. Debugging(自带,不需要额外安装插件) + +在Visual Studio Code(VSCode)中,[Debug Console](https://code.visualstudio.com/Docs/editor/debugging)是一个用于查看程序调试信息的窗口。它通常用于查看程序在调试过程中输出的日志信息、变量的值等。Debug Console提供了一个方便的方式来查看和分析程序的执行过程,帮助开发人员定位和解决代码中的问题。 + + +---- + +[vscode历史版本下载地址](https://code.visualstudio.com/updates/v1_86)
+[vscode扩展应用市场vsix文件手动下载安装](https://marketplace.visualstudio.com/search?target=VSCode&category=All%20categories&sortBy=Installs)
+[vscode插件历史版本下载https://open-vsx.org](https://open-vsx.org/)
+[vscode扩展应用市场vsix文件手动下载历史版本插件包](https://blog.csdn.net/qq_15054345/article/details/133884626)
+[在VScode中添加Linux中的Docker容器中的Python解释器](https://blog.csdn.net/weixin_43268590/article/details/129244984)
+[VSCode连不上远程服务器](https://blog.csdn.net/qq_42610612/article/details/132782965)
+[无网机的vscode中怎么使用jupyter notebook](https://www.bilibili.com/read/cv34411972/?jump_opus=1)
diff --git a/09_remote_ipython/README.md b/09_remote_ipython/README.md index a797579..19a1543 100644 --- a/09_remote_ipython/README.md +++ b/09_remote_ipython/README.md @@ -26,7 +26,7 @@ done! 1. 打开ipython ```python -from notebook.auth import passwd +from IPython.lib import passwd #from notebook.auth import passwd In [2] : passwd() # 输入密码 Enter password: Verify password: diff --git a/10_docker/README.md b/10_docker/README.md index 92a4827..b823beb 100644 --- a/10_docker/README.md +++ b/10_docker/README.md @@ -1,4 +1,4 @@ -# simple use for docker +# [docker入门实践](https://yeasy.gitbook.io/docker_practice/) ## 1. docker安装及配置Docker镜像站 @@ -7,7 +7,7 @@ [docker docs for mac](https://docs.docker.com/docker-for-mac/)
1.2 linux下安装
-TODO +[Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/) 1.3 配置docker镜像站
[docker镜像站网址](https://www.daocloud.io/mirror#accelerator-doc)
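+以Linux为例,镜像站一般配置在 /etc/docker/daemon.json(示意,镜像站地址以上面链接里可用的为准):
+```shell
+# /etc/docker/daemon.json 内容示意
+{
+  "registry-mirrors": ["https://docker.m.daocloud.io"]
+}
+# 改完后重启docker生效
+sudo systemctl daemon-reload
+sudo systemctl restart docker
+```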
@@ -15,7 +15,8 @@ TODO 1.4 配置docker代理
- windows中右击图标,选settings->Proxies - - [mac/linux](https://www.cnblogs.com/EasonJim/p/9988154.html) + - [mac](https://www.cnblogs.com/EasonJim/p/9988154.html) + - [linux](https://blog.csdn.net/qq_30034989/article/details/132021346) ```shell # 如果使用HTTP代理服务器时,将为docker服务创建systemd插件目录 @@ -146,11 +147,21 @@ docker stop $(docker ps -a -q) docker rm $(docker ps -a -q) ``` -2.12 docker修改完镜像生成新的镜像以后貌似没看法删除旧的镜像 +2.12 虚悬镜像 + +上面的镜像列表中,还可以看到一个特殊的镜像,这个镜像既没有仓库名,也没有标签,均为
+`<none>`。
+```shell
+<none>              <none>              00285df0df87        5 days ago          342 MB
+```
+这个镜像原本是有镜像名和标签的,原来为 mongo:3.2,随着官方镜像维护,发布了新版本后,重新 docker pull mongo:3.2 时,mongo:3.2 这个镜像名被转移到了新下载的镜像身上,而旧的镜像上的这个名称则被取消,从而成为了 `<none>`。除了 docker pull 可能导致这种情况,docker build 也同样可以导致这种现象。由于新旧镜像同名,旧镜像名称被取消,从而出现仓库名、标签均为 `<none>` 的镜像。这类无标签镜像也被称为虚悬镜像(dangling image),可以用下面的命令专门显示这类镜像:
+```shell
+$ docker image ls -f dangling=true
+REPOSITORY          TAG                 IMAGE ID            CREATED             SIZE
+<none>              <none>              00285df0df87        5 days ago          342 MB
+```
+一般来说,虚悬镜像已经失去了存在的价值,是可以随意删除的,可以用下面的命令删除。
```shell -pip install -i https://pypi.tuna.tsinghua.edu.cn/simple numpy -pandas sklearn jieba gensim tqdm flask requests PyMySQL redis pyahocorasick -pymongo pyspark py2neo neo4j-driver==$PYTHON_DRIVER_VERSION +$ docker image prune ``` 2.13 拷贝宿主机本地文件到docker中,和从docker中拷贝到宿主机 @@ -170,6 +181,14 @@ docker stop `docker ps -a| grep python:3.6 | awk '{print $1}'` docker rm `docker ps -a| grep python:3.6 | awk '{print $1}'` ``` +2.15 docker中python print不生效解决办法 +```shell +#方法一 显式调用flush +print("Hello www", flush=True) +#方法二 使用 "-u" 参数执行 python 命令 +sudo nvidia-docker run -v $PWD/masr_bz:/workspace/masr_bz -w /workspace/masr_bz binzhouchn/pytorch:1.7-cuda10.1-cudnn7-masr python -u train.py +``` + ## 3. docker镜像使用 @@ -411,6 +430,26 @@ docker pull stardog/stardog:latest docker run -v ~/stardog-6.2.2/:/var/opt/stardog -e STARDOG_SERVER_JAVA_ARGS="-Xmx8g -Xms8g -XX:MaxDirectMemorySize=2g" stardog/stardog:latest ``` + +3.8 容器云k8s + +Kubernetes是什么?Kubernetes是一个全新的基于容器技术的分布式架构解决方案,是Google开源的一个容器集群管理系统,Kubernetes简称K8S。Kubernetes 提供了完善的管理工具,这些工具涵盖了开发、部署测试、运维监控在内的各个环节。
+ +Kubernetes特性
+ - 自我修复:在节点故障时,重新启动失败的容器,替换和重新部署,保证预期的副本数量;杀死健康检查失败的容器,并且在未准备好之前不会处理用户的请求,确保线上服务不中断。 + - 弹性伸缩:使用命令、UI或者基于CPU使用情况自动快速扩容和缩容应用程序实例,保证应用业务高峰并发时的高可用性;业务低峰时回收资源,以最小成本运行服务。 + - 自动部署和回滚:K8S采用滚动更新策略更新应用,一次更新一个Pod,而不是同时删除所有Pod,如果更新过程中出现问题,将回滚更改,确保升级不影响业务。 + - 服务发现和负载均衡:K8S为多个容器提供一个统一访问入口(内部IP地址和一个DNS名称),并且负载均衡关联的所有容器,使得用户无需考虑容器IP问题。 + - 机密和配置管理:管理机密数据和应用程序配置,而不需要把敏感数据暴露在镜像里,提高敏感数据安全性。并可以将一些常用的配置存储在K8S中,方便应用程序使用。 + - 存储编排:挂载外部存储系统,无论是来自本地存储,公有云,还是网络存储,都作为集群资源的一部分使用,极大提高存储使用灵活性。 + - 批处理:提供一次性任务,定时任务;满足批量数据处理和分析的场景。 + +[Kubernetes 深入学习(一) —— 入门和集群安装部署](https://www.cnblogs.com/chiangchou/p/k8s-1.html#_label0_0)
+[Kubernetes(一) 跟着官方文档从零搭建K8S](https://juejin.cn/post/6844903943051411469)
+[kubeadm部署k8s集群最全最详细](https://blog.csdn.net/Doudou_Mylove/article/details/103901732)
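+上面的弹性伸缩、自动部署和回滚等特性,可以用 kubectl 快速体验(示意,假设已有可用集群):
+```shell
+# 自动部署:创建一个deployment
+kubectl create deployment nginx --image=nginx:1.25
+# 弹性伸缩:手动扩容到3个副本
+kubectl scale deployment nginx --replicas=3
+# 基于CPU使用率自动扩缩容
+kubectl autoscale deployment nginx --min=2 --max=5 --cpu-percent=80
+# 滚动更新与回滚
+kubectl set image deployment/nginx nginx=nginx:1.26
+kubectl rollout undo deployment/nginx
+```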
+ + + [RDF入门](https://blog.csdn.net/txlCandy/article/details/50959358)
[OWL语言](https://blog.csdn.net/zycxnanwang/article/details/86557350)
diff --git a/10_docker/newapi_docker_demo/README.md b/10_docker/newapi_docker_demo/README.md new file mode 100644 index 0000000..39dcc95 --- /dev/null +++ b/10_docker/newapi_docker_demo/README.md @@ -0,0 +1,94 @@ +## Dockerfile + +```Dockerfile +FROM node:16 as builder + +WORKDIR /build +COPY web/package.json . +RUN npm install +COPY ./web . +COPY ./VERSION . +RUN DISABLE_ESLINT_PLUGIN='true' VITE_REACT_APP_VERSION=$(cat VERSION) npm run build + +FROM golang AS builder2 + +ENV GO111MODULE=on \ + CGO_ENABLED=1 \ + GOOS=linux + +WORKDIR /build +ADD go.mod go.sum ./ +RUN go mod download +COPY . . +COPY --from=builder /build/dist ./web/dist +RUN go build -ldflags "-s -w -X 'one-api/common.Version=$(cat VERSION)' -extldflags '-static'" -o one-api + +FROM alpine + +RUN apk update \ + && apk upgrade \ + && apk add --no-cache ca-certificates tzdata \ + && update-ca-certificates 2>/dev/null || true + +COPY --from=builder2 /build/one-api / +EXPOSE 3000 +WORKDIR /data +ENTRYPOINT ["/one-api"] +``` + + +## Dockerfile 解析 + +这个 Dockerfile 通过多个阶段构建一个含前端和后端组件的应用。每个阶段使用不同的基础镜像和步骤来完成特定的任务。 + +### 第一阶段:前端构建(Node.js) + +- **基础镜像**: + - `FROM node:16 as builder`:使用 Node.js 16 版本的官方镜像作为基础镜像,并标记此构建阶段为 `builder`。 +- **设置工作目录**: + - `WORKDIR /build`:将工作目录设置为 `/build`。 +- **复制文件**: + - `COPY web/package.json .`:将前端代码目录下的 `package.json` 文件复制到工作目录中。 +- **安装依赖**: + - `RUN npm install`:根据 `package.json` 安装所需依赖。 +- **复制前端代码和版本文件**: + - `COPY ./web .`:将web文件夹下所有文件复制到工作目录。 + - `COPY ./VERSION .`:将项目版本文件复制到工作目录。 +- **构建前端项目**: + - `RUN DISABLE_ESLINT_PLUGIN='true' VITE_REACT_APP_VERSION=$(cat VERSION) npm run build`:设置环境变量并执行前端构建脚本,生成生产环境用的前端文件。 + +### 第二阶段:后端构建(Go) + +- **基础镜像**: + - `FROM golang AS builder2`:使用 Go 的官方镜像作为基础,并标记此阶段为 `builder2`。 +- **环境变量**: + - 设置多个环境变量,以支持 Go 的模块系统和确保生成的是适用于 Linux 的静态链接二进制文件。 +- **设置工作目录**: + - `WORKDIR /build`:设置工作目录。 +- **添加 Go 模块文件**: + - `ADD go.mod go.sum ./`:添加 Go 模块定义文件。 +- **下载依赖**: + - `RUN go mod download`:下载 Go 依赖。 +- **复制代码和前端构建产物**: + - `COPY . .`:复制所有后端代码到工作目录。 + - `COPY --from=builder /build/dist ./web/dist`:从第一阶段中复制构建好的前端文件到后端服务目录中。 +- **编译应用**: + - `RUN go build -ldflags "-s -w -X 'one-api/common.Version=$(cat VERSION)' -extldflags '-static'" -o one-api`:使用 Go 编译命令构建应用,设置链接器选项以嵌入版本信息并优化二进制大小。 + +### 第三阶段:运行环境 + +- **基础镜像**: + - `FROM alpine`:使用轻量级的 Alpine Linux 镜像作为基础。 +- **安装证书和时区数据**: + - 运行一系列命令以安装必要的证书和时区数据,确保应用可以处理 HTTPS 连接和正确的时间。 +- **复制编译好的应用**: + - `COPY --from=builder2 /build/one-api /`:从第二阶段复制编译好的应用到根目录。 +- **端口和工作目录**: + - `EXPOSE 3000`:声明容器在运行时会监听 3000 端口。 + - `WORKDIR /data`:设置工作目录,应用可能会使用此目录来存储数据。 +- **设置入口点**: + - `ENTRYPOINT ["/one-api"]`:设置容器启动时执行的命令。 + +### 总结 + +此 Dockerfile 首先构建前端资源,然后构建后端服务,并将前端资源集成到后端服务中,最后在一个轻量级容器中运行编译好的二进制文件,实现前后端的自动化构建和部署。 diff --git a/12_nginx/README.md b/12_nginx/README.md index 96a4c31..15f65bc 100644 --- a/12_nginx/README.md +++ b/12_nginx/README.md @@ -1,11 +1,17 @@ ## nginx -[nginx作为http服务器-静态页面的访问](https://www.cnblogs.com/xuyang94/p/12667844.html)
-[docker nginx反向代理](https://www.cnblogs.com/dotnet261010/p/12596185.html)
-[nginx负载均衡参考1](https://www.jianshu.com/p/4c250c1cd6cd)
-[nginx负载均衡参考2](https://www.cnblogs.com/diantong/p/11208508.html)
+[**1. nginx入门使用**](#nginx入门使用) + +[**2. nginx正则使用1(2024.4.2更新)**](#nginx正则使用1) + + -### nginx使用 +--- + +### nginx入门使用 + +
+点击展开 **1. 第一步用安装docker nginx** @@ -72,4 +78,31 @@ docker run --name=nginx -d -p 4030:4030 nginx def custom(): return str(3 + 2) ``` -[配置文件2](default2.conf) \ No newline at end of file +[配置文件2](default2.conf) + +
+ +### nginx正则使用1 + +```shell +cd /etc/nginx/conf.d +#修改后重启 +systemctl restart nginx +nginx -s reload +``` +[配置文件3](default3.conf) + +说明:本次使用正则的目的是当我访问 +http://10.28.xx.xx:8000/aimanager_gpu/recsys/时, +正则匹配后转到http://localhost:10086,后面不加/aimanager_gpu/recsys路由 +(如果不走正则那么proxy_pass转到http://localhost:10086后会自动拼接/aimanager_gpu/recsys) + + + + + - 参考资料 + +[nginx作为http服务器-静态页面的访问](https://www.cnblogs.com/xuyang94/p/12667844.html)
+[docker nginx反向代理](https://www.cnblogs.com/dotnet261010/p/12596185.html)
+[nginx负载均衡参考1](https://www.jianshu.com/p/4c250c1cd6cd)
+[nginx负载均衡参考2](https://www.cnblogs.com/diantong/p/11208508.html)
\ No newline at end of file diff --git a/12_nginx/default3.conf b/12_nginx/default3.conf new file mode 100644 index 0000000..2253947 --- /dev/null +++ b/12_nginx/default3.conf @@ -0,0 +1,15 @@ +upstream recsys { + server localhost:10086; + } + +server { + server_name localhost; + listen 8000; + location ~* /aimanager_gpu/recsys/ { + if ($request_uri ~ /aimanager_gpu/recsys/(.+)) + { + set $rightUrl $1; + } + proxy_pass http://recsys/$rightUrl; + } +} \ No newline at end of file diff --git a/14_go/README.md b/14_go/README.md new file mode 100644 index 0000000..a3e9da9 --- /dev/null +++ b/14_go/README.md @@ -0,0 +1,192 @@ +# python调用golang + +## 示例一 python端输入int返回int + +```Go +package main + +import ( + "C" +) + +func f1(x int) int { + return x*x + 2 +} + +//export Fib +func Fib(n int) int { + if n == 1 || n == 2 { + return 1 + } else { + return Fib(n-1) + Fib(n-2) + f1(1) + } +} + +func main() {} +``` + +//go build -buildmode=c-shared -o _fib.so fib.go
+//参考链接https://blog.csdn.net/cainiao_python/article/details/107724309
+//将_fib.so文件拷贝到python文件夹下
+ +```python +import ctypes +import time +from ctypes import * +so = ctypes.cdll.LoadLibrary('./_fib.so') +start = time.time() +result = so.Fib(40) +end = time.time() +print(f'斐波那契数列第40项:{result},耗时:{end - start}') +``` + +## 示例二 python端输入string返回string(推荐看示例三) + +```Go +package main + +import ( + "C" + "database/sql" + "log" + "strings" + + _ "github.com/go-sql-driver/mysql" +) + +//export Gdbc +func Gdbc(uri *C.char) string { + log.Println(uri) + db, err := sql.Open("mysql", C.GoString(uri)) + if err != nil { + log.Fatalln(err) + } + rows, err := db.Query("SELECT feature_word FROM insurance_qa.feature_words") + if err != nil { + log.Fatalln(err) + } + res := []string{} + for rows.Next() { + var s string + err = rows.Scan(&s) + if err != nil { + log.Fatalln(err) + } + // log.Printf("found row containing %q", s) + res = append(res, s) + } + rows.Close() + return strings.Join(res, ",") +} + +func main() { + // res := Gdbc("username:password@tcp(localhost:3306)/database?charset=utf8") + // fmt.Println(res) +} +``` +//go build -buildmode=c-shared -o _gdbc.so test.go
+//将_gdbc.so文件拷贝到python文件夹下
+ +```python +import ctypes +import time +from ctypes import * +class StructPointer(Structure): + _fields_ = [("p", c_char_p), ("n", c_longlong)] + +so = ctypes.cdll.LoadLibrary('./_gdbc.so') +so.Gdbc.restype = StructPointer +start = time.time() +uri = "username:password@tcp(localhost:3306)/database?charset=utf8" +res = so.Gdbc(uri.encode("utf-8")) +print(res.n) +print(res.p[:res.n].decode())#print(res.p.decode())这样貌似也没问题 +end = time.time() +print(f'耗时:{end - start}') +``` + +## 示例三 python端输入string,go查询数据库然后返回json str + +```Go +package main + +import ( + "C" + "database/sql" + "encoding/json" + "log" + + _ "github.com/go-sql-driver/mysql" +) + +type Fw struct { + feature_word string + word_type string + id int64 +} + +//export Gdbc +func Gdbc(uri *C.char) string { + db, err := sql.Open("mysql", C.GoString(uri)) + //设置数据库最大连接数 + db.SetConnMaxLifetime(100) + //设置上数据库最大闲置连接数 + db.SetMaxIdleConns(10) + if err != nil { + log.Fatalln(err) + } + rows, err := db.Query("SELECT feature_word,word_type,id FROM insurance_qa.feature_words") + if err != nil { + log.Fatalln(err) + } + res := [][]interface{}{} + var fw Fw + for rows.Next() { + err = rows.Scan(&fw.feature_word, &fw.word_type, &fw.id) + if err != nil { + log.Fatalln(err) + } + // log.Printf("found row containing %q", s) + tmp := []interface{}{} + tmp = append(tmp, fw.feature_word) + tmp = append(tmp, fw.word_type) + tmp = append(tmp, fw.id) + res = append(res, tmp) + // res = append(res, []interface{}{fw.feature_word, fw.word_type, fw.id})//上面的一行写法 + } + rows.Close() + b, err := json.Marshal(res) + if err != nil { + panic(err) + } + result := string(b) + return result +} + +func main() {} + +``` + +//go build -buildmode=c-shared -o _gdbc.so test.go
+//将_gdbc.so文件拷贝到python文件夹下
+ +```python +import ctypes +import time +import json +from ctypes import * +class StructPointer(Structure): + _fields_ = [("p", c_char_p), ("n", c_longlong)] + +so = ctypes.cdll.LoadLibrary('./_gdbc.so') +so.Gdbc.restype = StructPointer +start = time.time() +uri = "username:password@tcp(localhost:3306)/database?charset=utf8" +res = so.Gdbc(uri.encode("utf-8")) +print(res.n) +print(res.p.decode()) +print(json.loads(res.p.decode())) +end = time.time() +``` + +## \ No newline at end of file diff --git a/15_ansible/README.md b/15_ansible/README.md new file mode 100644 index 0000000..9b2aff0 --- /dev/null +++ b/15_ansible/README.md @@ -0,0 +1,66 @@ +# ansible笔记 + +```shell +在/etc/ansible/ansible.cfg下配置[model] +# ping +ansible model -m ping +# ansible-playbook写剧本 +ansible-playbook xxx.yaml +# 传文件 +ansible model -m copy -a "src=./test.txt dest=/home/zhoubin" +# 创建文件(ansible-playbook形式) +- hosts: model + remote_user: zhoubin + tasks: + - name: "create test2.txt in the /etc directory" + file: + path: "/home/zhoubin/test2.txt" + state: "touch" +# 创建文件夹(ansible-playbook形式) +- hosts: model + remote_user: zhoubin + tasks: + - name: "create tmp file in the /etc directory" + file: + path: "/home/zhoubin/tmp" + state: "directory" +# 删除文件(ansible-playbook形式) +- hosts: model + remote_user: zhoubin + tasks: + - name: "delete test.txt in the /etc directory" + file: + path: "/home/zhoubin/test.txt" + state: "absent" +# 删除多个文件(ansible-playbook形式) +- hosts: model + remote_user: zhoubin + tasks: + - name: "delete multi files in the /etc directory" + file: + path: "{{ item }}" + state: "absent" + with_items: + - /home/zhoubin/test1.txt + - /home/zhoubin/test2.txt +# 将远程服务器文件拷贝到本机 +ansible model -m fetch -a "src=/home/zhoubin/test.txt dest=./ force=yes backup=yes" + +# 写一个剧本(传docker镜像并且加载) become:yes可以避免sudo输密码! +- hosts: model + remote_user: zhoubin + tasks: + - name: copy docker image + copy: src=./py37.tar.gz dest=/home/zhoubin + - name: load image + shell: docker load -i /home/zhoubin/py37.tar.gz + become: yes + + +``` + + +### 附录 + +[超简单ansible2.4.2.0与playbook入门教程](https://blog.csdn.net/qq_45206551/article/details/105004233)
+[ansible-命令使用说明](https://www.cnblogs.com/scajy/p/11353825.html)
diff --git a/99_pycharm_archive/.DS_Store b/99_pycharm_archive/.DS_Store new file mode 100644 index 0000000..bec7f6f Binary files /dev/null and b/99_pycharm_archive/.DS_Store differ diff --git a/08_pycharm/README.md b/99_pycharm_archive/README.md similarity index 100% rename from 08_pycharm/README.md rename to 99_pycharm_archive/README.md diff --git a/08_pycharm/pic/pycharm_activ.png b/99_pycharm_archive/pic/pycharm_activ.png similarity index 100% rename from 08_pycharm/pic/pycharm_activ.png rename to 99_pycharm_archive/pic/pycharm_activ.png diff --git a/08_pycharm/pic/pycharm_git1.png b/99_pycharm_archive/pic/pycharm_git1.png similarity index 100% rename from 08_pycharm/pic/pycharm_git1.png rename to 99_pycharm_archive/pic/pycharm_git1.png diff --git a/08_pycharm/pic/pycharm_git2.png b/99_pycharm_archive/pic/pycharm_git2.png similarity index 100% rename from 08_pycharm/pic/pycharm_git2.png rename to 99_pycharm_archive/pic/pycharm_git2.png diff --git a/08_pycharm/pic/pycharm_remote1.png b/99_pycharm_archive/pic/pycharm_remote1.png similarity index 100% rename from 08_pycharm/pic/pycharm_remote1.png rename to 99_pycharm_archive/pic/pycharm_remote1.png diff --git a/08_pycharm/pic/pycharm_remote2.png b/99_pycharm_archive/pic/pycharm_remote2.png similarity index 100% rename from 08_pycharm/pic/pycharm_remote2.png rename to 99_pycharm_archive/pic/pycharm_remote2.png diff --git a/08_pycharm/pic/pycharm_remote3.png b/99_pycharm_archive/pic/pycharm_remote3.png similarity index 100% rename from 08_pycharm/pic/pycharm_remote3.png rename to 99_pycharm_archive/pic/pycharm_remote3.png diff --git a/08_pycharm/pic/pycharm_remote4.png b/99_pycharm_archive/pic/pycharm_remote4.png similarity index 100% rename from 08_pycharm/pic/pycharm_remote4.png rename to 99_pycharm_archive/pic/pycharm_remote4.png diff --git a/08_pycharm/pic/pycharm_remote5.png b/99_pycharm_archive/pic/pycharm_remote5.png similarity index 100% rename from 08_pycharm/pic/pycharm_remote5.png rename to 99_pycharm_archive/pic/pycharm_remote5.png diff --git "a/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/.DS_Store" "b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/.DS_Store" new file mode 100644 index 0000000..07fbdca Binary files /dev/null and "b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/.DS_Store" differ diff --git "a/08_pycharm/\346\277\200\346\264\273\347\240\201/jetbrains-agent.jar" "b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/jetbrains-agent.jar" similarity index 100% rename from "08_pycharm/\346\277\200\346\264\273\347\240\201/jetbrains-agent.jar" rename to "99_pycharm_archive/\346\277\200\346\264\273\347\240\201/jetbrains-agent.jar" diff --git "a/08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\201.txt" "b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\201.txt" similarity index 100% rename from "08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\201.txt" rename to "99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\201.txt" diff --git "a/08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2011.txt" 
"b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2011.txt" similarity index 100% rename from "08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2011.txt" rename to "99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2011.txt" diff --git "a/08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2012.txt" "b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2012.txt" similarity index 100% rename from "08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2012.txt" rename to "99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2012.txt" diff --git "a/08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2013.txt" "b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2013.txt" similarity index 100% rename from "08_pycharm/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2013.txt" rename to "99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/\346\277\200\346\264\273\347\240\2013.txt" diff --git "a/08_pycharm/\346\277\200\346\264\273\347\240\201/\351\235\236\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/Pycharm\346\226\271\345\274\217\344\270\200\346\277\200\346\264\273\347\240\201\346\261\207\346\200\273.docx" "b/99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\351\235\236\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/Pycharm\346\226\271\345\274\217\344\270\200\346\277\200\346\264\273\347\240\201\346\261\207\346\200\273.docx" similarity index 100% rename from "08_pycharm/\346\277\200\346\264\273\347\240\201/\351\235\236\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/Pycharm\346\226\271\345\274\217\344\270\200\346\277\200\346\264\273\347\240\201\346\261\207\346\200\273.docx" rename to "99_pycharm_archive/\346\277\200\346\264\273\347\240\201/\351\235\236\346\260\270\344\271\205\346\277\200\346\264\273\347\240\201/Pycharm\346\226\271\345\274\217\344\270\200\346\277\200\346\264\273\347\240\201\346\261\207\346\200\273.docx" diff --git a/README.md b/README.md index fbfa3cd..561cd7a 100644 --- a/README.md +++ b/README.md @@ -2,23 +2,28 @@ [![Analytics](https://ga-beacon.appspot.com/GA-80121379-2/notes-python)](https://github.com/binzhouchn/feature_engineering) # python笔记 -> 版本:0.3
+> 版本:0.5
> 作者:binzhou
> 邮件:binzhouchn@gmail.com
`Github`加载`ipynb`的速度较慢,建议在 [Nbviewer](http://nbviewer.ipython.org/github/lijin-THU/notes-python/blob/master/index.ipynb) 中查看该项目。 +[python各版本下载仓库](https://www.python.org/ftp/python/)
+ --- ## 简介 -默认安装了 `Python 3.8`,以及相关的第三方包 `gensim`, `tqdm`, `flask` +默认安装了 `Python 3.10`,以及相关的第三方包 `gensim`, `tqdm`, `flask` + +anaconda 虚拟环境创建python版本降级命令:conda create -n tableqa python=3.9 > life is short.use python. 推荐使用[Anaconda](http://www.continuum.io/downloads),这个IDE集成了大部分常用的包。 -### python pip使用国内镜像 +
+pip使用国内镜像 [让python pip使用国内镜像](https://www.cnblogs.com/wqpkita/p/7248525.html) ```shell @@ -33,7 +38,10 @@ pip install -i http://pypi.douban.com/simple --trusted-host pypi.douban.com flas pip --proxy=proxyAddress:port install -i http://pypi.douban.com/simple --trusted-host pypi.douban.com flask ``` -### pip镜像配置 +
+ +
+pip镜像配置 pip install镜像配置(Linux) ``` @@ -55,3 +63,28 @@ index-url = http://mirrors.aliyun.com/pypi/simple/ [install] trusted-host = mirrors.aliyun.com ``` +
+ +## 使用conda升级到python3.12 + +方法一
+[参考链接](https://qa.1r1g.com/sf/ask/4099772281/)
+```shell +conda update -n base -c defaults conda +conda install -c anaconda python=3.12 +#然后再重新安装下依赖包 +``` +方法二(或使用虚拟环境)
+``` +$ conda create -p /your_path/env_name python=3.12 +# 激活环境 +$ source activate /your_path/env_name +# 关闭环境 +$ source deactivate /your_path/env_name +# 删除环境 +$ conda env remove -p /your_path/env_name +``` + +## 其他python仓库推荐 + +[All algorithms implemented in Python - for education](https://github.com/TheAlgorithms/Python/)