| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
- import pandas as pd
- import polars as pl
- import yaml
- from pathlib import Path
# Columns loaded from the patent detail CSV: application number, IPC
# classification code, application date and abstract.
columns = [
    "专利申请号",
    "IPC分类号",
    "专利申请日",
    "摘要",
]

# Input: patent detail data for listed companies.
patent_file_name = "~/Documents/论文/上市公司-专利明细数据.csv"
# Output: the IPC-filtered patent rows.
desc_file_name = "~/Documents/论文/上市公司-专利摘要数据.csv"
# Output: a sample drawn from the IPC-filtered patent rows.
demo_file_name = "~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本.csv"
def read_csv(file_name=None, chunk_size: int = 10):
    """Stream a CSV file chunk by chunk and print the size of every chunk.

    Args:
        file_name: Path of the CSV file to read. When ``None`` (the default)
            the module-level ``patent_file_name`` is used.
        chunk_size: Maximum number of rows per chunk. The default of 10 is
            the value this function has always used; pass a larger value
            (e.g. 100_000) for real workloads.

    Returns:
        None. The function only prints one line per chunk.
    """
    if file_name is None:
        file_name = patent_file_name

    # The chunked reader is used as a context manager so the underlying file
    # handle is released even if processing a chunk raises.
    with pd.read_csv(file_name, chunksize=chunk_size, encoding="utf-8") as reader:
        for chunk in reader:
            # Placeholder for per-chunk processing.
            print("Processing a chunk of size:", len(chunk))
- # read_csv()
def get_ipc_df(ipcs):
    """Load the patent CSV and keep the rows whose IPC code matches *ipcs*.

    Only the columns listed in the module-level ``columns`` are read from
    ``patent_file_name``.

    Args:
        ipcs: Iterable of strings. The entries are joined with ``|`` into a
            single regular expression, so each entry is treated as a regex
            and may match anywhere inside the ``IPC分类号`` value. Empty
            entries are ignored.

    Returns:
        A polars DataFrame holding the matching rows. If *ipcs* contains no
        non-empty entry, the frame has the same columns but zero rows.
    """
    # Drop empty entries: an empty alternative in the regex would match every
    # row and silently disable the filter.
    regex_pattern = "|".join(ipc for ipc in ipcs if ipc)

    # "utf8" is polars' native encoding name. Any other spelling (including
    # "utf-8") makes polars decode the whole file in Python before parsing.
    source = str(Path(patent_file_name).expanduser())
    df = pl.read_csv(source, columns=columns, encoding="utf8")

    if not regex_pattern:
        # Nothing to match against: return an empty frame with the same
        # schema instead of the whole, unfiltered dataset.
        return df.head(0)

    return df.filter(pl.col('IPC分类号').str.contains(regex_pattern))
if __name__ == '__main__':
    # Select the patents whose IPC code is listed in building_ipc.yml
    # (building-industrialization related codes) and save them.
    # The encoding is explicit so the YAML file is decoded the same way
    # regardless of the platform's locale default.
    ipc_yaml = Path('building_ipc.yml').read_text(encoding='utf-8')
    ipc_data = yaml.safe_load(ipc_yaml)
    if not isinstance(ipc_data, dict) or not ipc_data:
        # An empty or non-mapping file would otherwise fail with an opaque
        # AttributeError on .keys(), or produce an unfiltered export.
        raise SystemExit('building_ipc.yml must be a non-empty mapping keyed by IPC code')

    # The mapping keys are the IPC codes used as filter patterns.
    keywords_list = list(ipc_data)
    df = get_ipc_df(keywords_list)
    df.write_csv(desc_file_name)

    # Draw a 5% random sample of the filtered rows and save it as well.
    sample_df = df.sample(fraction=0.05)
    sample_df.write_csv(demo_file_name)
|