| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- import prefect
- from prefect import flow, task
- import time
- import random
- import data_preparation
- import bio
- import ner
- from env import env
- # 定义任务 (Tasks)
@task
def filter_patent(ipcs: list, start=None, end=None):
    """Prefect task that filters patent records and saves the result.

    This is a thin wrapper: the three arguments are forwarded unchanged, in
    order, to ``data_preparation.save_building_csv``.

    Per the original author's note, the records are taken from the file
    ``上市公司-专利明细数据.csv``.

    :param ipcs: list of IPC classifications used for filtering.
    :param start: start date, e.g. ``'2014-01-01'``; defaults to ``None``.
    :param end: end date; defaults to ``None``.
    :return: nothing; the helper's return value is not propagated.
    """
    save_csv = data_preparation.save_building_csv
    save_csv(ipcs, start, end)
def sample_patent(fraction=0.05):
    """Intended to draw a random sample from the filtered patent data.

    Per the original author's note, the sample is saved to the file
    ``上市公司-专利摘要数据-筛选.csv``.

    :param fraction: sampling rate; defaults to ``0.05``.
    :return: nothing; the helper's return value is not propagated.
    """
    # NOTE(review): ``fraction`` is passed as the only positional argument to
    # ``save_building_csv`` -- the same helper ``filter_patent`` calls with
    # ``(ipcs, start, end)``. This looks like a copy-paste slip; confirm
    # whether a dedicated sampling helper in ``data_preparation`` was meant.
    save_csv = data_preparation.save_building_csv
    save_csv(fraction)
def deepseek_ner_sample():
    """Run named-entity recognition on the sampled patents via DeepSeek.

    Delegates entirely to ``bio.ner_sample_by_deepseek()``. Per the original
    author's note, the result is saved to ``ds_sample_ner_result.yml``.

    :return: ``None``; the helper's return value is not propagated.
    """
    bio.ner_sample_by_deepseek()
    return None
@task
def cut_words():
    """Placeholder Prefect task; the body is not implemented yet.

    Presumably reserved for a word-segmentation step -- TODO confirm.

    :return: ``None``.
    """
    return None
def add_bio():
    """Apply BIO tagging by delegating to ``bio.add_bio()``.

    Per the original author's note, tagging is based on the named entities
    recognised in ``ds_sample_ner_result.yml`` and the result is saved to
    ``上市公司-专利摘要数据-筛选-样本-BIO.csv``.

    :return: ``None``; the helper's return value is not propagated.
    """
    tag_samples = bio.add_bio
    tag_samples()
def train(model_name):
    """Train the model by delegating to ``ner.train``.

    :param model_name: model identifier, forwarded unchanged to ``ner.train``.
    :return: ``None``; the helper's return value is not propagated.
    """
    run_training = ner.train
    run_training(model_name)
def model_test():
    """Placeholder; the body is not implemented yet.

    Presumably reserved for evaluating a trained model -- TODO confirm.

    :return: ``None``.
    """
    return None
def _ner():
    """Placeholder; the body is not implemented yet.

    Presumably reserved for a named-entity-recognition step -- TODO confirm.

    :return: ``None``.
    """
    return None
def lda():
    """Placeholder; the body is not implemented yet.

    Presumably reserved for an LDA topic-modelling step -- TODO confirm.

    :return: ``None``.
    """
    return None
if __name__ == '__main__':
    # Point env.data_folder and then env.output_folder at the same local
    # directory (chained assignment binds the targets left to right).
    env.data_folder = env.output_folder = '/Users/gaojun/dev/gxy-gd/论文'
    # Alternative entry point kept from the original, currently disabled:
    #   ner.predict_test('hfl/chinese-roberta-wwm-ext-large-building-building')
    ner.train('hfl/chinese-roberta-wwm-ext-large-building-building')
- #
- # # 定义参数化的 Flow
- # @flow(name="Multi-Parameter Experiment Flow", log_prints=True) # log_prints=True 会自动捕获 print 语句到 Prefect 日志
- # def run_experiment(
- # data_source: str = "default_source", # 参数1: 数据源
- # process_level: float = 0.5, # 参数2: 处理级别
- # report_type: str = "txt" # 参数3: 报告格式
- # ):
- # """
- # 这是一个接收多个参数的实验性数据流。
- # """
- # run_id = prefect.runtime.flow_run.id # 获取当前 Flow Run 的 ID
- # print(f"开始 Flow Run: {run_id}")
- # print(f"参数: data_source='{data_source}', process_level={process_level}, report_type='{report_type}'")
- #
- # # 调用 Tasks
- # raw_data = load_data(source=data_source)
- # processing_result = process_data(data=raw_data, processing_param=process_level)
- # final_report = generate_report(result=processing_result, report_format=report_type)
- #
- # print(f"Flow Run {run_id} 完成.")
- # return final_report # Flow 可以返回值,这也会被记录
- #
- # if __name__ == "__main__":
- # # 定义你想要试验的参数组合
- # parameter_combinations = [
- # {"data_source": "source_A", "process_level": 0.3, "report_type": "json"},
- # {"data_source": "source_A", "process_level": 0.8, "report_type": "csv"},
- # {"data_source": "source_B", "process_level": 0.5, "report_type": "txt"},
- # {"data_source": "source_C", "process_level": 0.9, "report_type": "json"},
- # {"data_source": "source_C", "process_level": 0.2, "report_type": "txt"},
- # ]
- #
- # print(f"准备执行 {len(parameter_combinations)} 组实验...")
- #
- # # 循环遍历参数组合,为每一组参数执行一次 Flow
- # results = []
- # for i, params in enumerate(parameter_combinations):
- # print(f"\n--- 开始第 {i+1} 次实验: {params} ---")
- # # 直接调用 flow 函数,并传入参数
- # # Prefect 会自动捕获这次调用为一个 Flow Run
- # try:
- # # 使用 **params 将字典解包为关键字参数传递给 flow 函数
- # result = run_experiment(**params)
- # results.append({"params": params, "result": result, "status": "Success"})
- # print(f"--- 第 {i+1} 次实验完成 ---")
- # except Exception as e:
- # results.append({"params": params, "result": str(e), "status": "Failed"})
- # print(f"--- 第 {i+1} 次实验失败: {e} ---")
- #
- # print("\n所有本地实验执行完毕。")
- # # 你可以在这里对 results 列表进行分析
- # # print("实验结果汇总:", results)
|