python 拆分(几G)的tsv文件为较小的csv文件
生活随笔
收集整理的這篇文章主要介紹了
python 拆分(几G)的tsv文件为较小的csv文件
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
?
import _thread
import os
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

import pandas as pd


# Read the data in chunks
# tsv_name: tsv文件名
# chunk_size: 分塊大小
# encoding_type: 編碼格式
def read_data(tsv_name: str, chunk_size: int, encoding_type: str):
    """Open a tab-separated file for chunked reading with pandas.

    :param tsv_name: path of the TSV file
    :param chunk_size: number of rows per chunk, passed to pandas as ``chunksize``
    :param encoding_type: text encoding of the file
    :return: the chunked reader returned by ``pd.read_csv``; iterating it
        yields one DataFrame per chunk
    """
    reader_options = {
        "sep": "\t",
        "chunksize": chunk_size,
        "encoding": encoding_type,
        "low_memory": False,
    }
    return pd.read_csv(tsv_name, **reader_options)


# Efficiently count the number of lines in a file
# file_name: 攜帶路徑的文件名字
def iter_count(file_name: str) -> int:
    """Count the lines of a file without loading it into memory.

    The file is read in binary mode in 1 MiB blocks and ``\\n`` bytes are
    counted, so no text decoding is done and the result does not depend on
    the platform's default encoding. A non-empty file whose last line has no
    trailing newline still has that last line counted.

    NOTE(review): files that use a bare ``\\r`` as line terminator are not
    split on ``\\r`` here — confirm no such inputs are expected.

    :param file_name: path of the file (may include directories)
    :return: number of lines in the file; 0 for an empty file
    """
    buffer_size = 1024 * 1024
    newline_count = 0
    last_block = b""
    with open(file_name, "rb") as f:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for block in iter(lambda: f.read(buffer_size), b""):
            newline_count += block.count(b"\n")
            last_block = block
    # An unterminated final line has no "\n" of its own but is still a line.
    if last_block and not last_block.endswith(b"\n"):
        newline_count += 1
    return newline_count


# Split a large TSV file into smaller CSV files
# tsv_name: tsv文件名
# split_num: 拆分個數
# encoding_type: 編碼格式
def split_data(tsv_name: str, split_num: int, encoding_type: str):
    """Split a large TSV file into roughly ``split_num`` smaller CSV files.

    Output goes to ``./splits_<tag>/chunk_<tag>_<n>.csv`` where ``<tag>`` is
    ``tsv_name[-8:-4]`` (presumably the four characters before a four-character
    extension such as ``.tsv`` — verify against callers) and ``<n>`` starts
    at 1. The function returns only after every chunk has been written.

    :param tsv_name: path of the TSV file to split
    :param split_num: desired number of output files, must be positive
    :param encoding_type: text encoding of the TSV file
    :raises ValueError: if ``split_num`` is not positive
    """
    if split_num <= 0:
        raise ValueError(f"split_num must be a positive integer, got {split_num!r}")

    tag = tsv_name[-8:-4]
    splits_dir = f"./splits_{tag}"
    os.makedirs(splits_dir, exist_ok=True)

    # iter_count counts every line of the file, including the header line,
    # which is not a data row and must not influence the chunk size.
    row_count = max(iter_count(tsv_name) - 1, 0)
    chunk_size, remainder = divmod(row_count, split_num)
    if remainder:
        chunk_size += 1
    # pandas rejects a chunksize below 1.
    chunk_size = max(chunk_size, 1)

    data = read_data(tsv_name, chunk_size, encoding_type)

    def write_data(_idx: int, _chunk):
        file_path = f"{splits_dir}/chunk_{tag}_{_idx + 1}.csv"
        _chunk.to_csv(file_path, index=False)

    # A bounded pool replaces one fire-and-forget thread per chunk: leaving
    # the ``with`` block waits for all writers, and ``result()`` re-raises
    # any error that happened while writing.
    max_workers = min(4, split_num)
    max_in_flight = 2 * max_workers
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = set()
        for idx, chunk in enumerate(data):
            # Stop reading ahead while too many chunks are waiting to be
            # written, so memory use stays bounded for very large files.
            if len(pending) >= max_in_flight:
                done, pending = wait(pending, return_when=FIRST_COMPLETED)
                for finished in done:
                    finished.result()
            pending.add(pool.submit(write_data, idx, chunk))
        for remaining in pending:
            remaining.result()
後來改進的方法:
@staticmethod
def readTsvData(path: str, encoding: str = "gb18030", start: int = 0, chunkSize: int = 10000) -> pd.DataFrame:
    """Read one slice of data rows from a TSV file.

    The first line of the file is taken as the column header. Fields are
    split on tab characters only and kept as strings. Only the requested
    rows are held in memory; earlier rows are skipped while streaming.

    :param path: str, path of the TSV file
    :param encoding: str, text encoding; undecodable bytes are ignored
    :param start: int, zero-based index of the first data row to read
        (the header line is not counted), must not be negative
    :param chunkSize: int, maximum number of data rows to read
    :return: DataFrame with the rows read; empty if ``start`` is past the end
    """
    from itertools import islice

    with open(path, "r", encoding=encoding, errors="ignore") as tsvFile:
        headerLine = next(tsvFile, None)
        # Strip the line terminator explicitly instead of cutting a fixed
        # two characters, so LF-only files and an unterminated last line
        # keep their final field intact.
        header = None if headerLine is None else headerLine.rstrip("\r\n").split("\t")
        rows = [
            line.rstrip("\r\n").split("\t")
            for line in islice(tsvFile, start, start + chunkSize)
        ]
    return pd.DataFrame(rows, columns=header)


@classmethod
def splitDataByTrunk(cls, file_path: str, chunk_size: int, save_path: str, encoding: str = "gb18030") -> None:
    """Split a TSV file into CSV files of ``chunk_size`` rows each.

    The files are written to ``<save_path>/<name>/<name>_<offset>.csv``,
    where ``<name>`` is the file name of ``file_path`` without its last four
    characters and ``<offset>`` is the index of the chunk's first data row.
    Every output file is UTF-8 encoded and repeats the header. The input is
    read once, one chunk at a time.

    :param file_path: str, path of the TSV file to read
    :param chunk_size: int, number of data rows per output file, must be positive
    :param save_path: str, directory under which the output folder is created
    :param encoding: str, text encoding of the input file
    :return: None
    :raises ValueError: if ``chunk_size`` is not positive
    """
    from itertools import islice

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    baseName = os.path.basename(file_path)[:-4]
    outDir = os.path.join(save_path, baseName)
    os.makedirs(outDir, exist_ok=True)

    with open(file_path, "r", encoding=encoding, errors="ignore") as tsvFile:
        headerLine = next(tsvFile, None)
        header = None if headerLine is None else headerLine.rstrip("\r\n").split("\t")
        start = 0
        while True:
            rows = [line.rstrip("\r\n").split("\t") for line in islice(tsvFile, chunk_size)]
            # When the row count is an exact multiple of chunk_size the last
            # read is empty; do not emit a header-only file for it. The very
            # first chunk is always written, even if the file has no rows.
            if not rows and start > 0:
                break
            chunk = pd.DataFrame(rows, columns=header)
            chunk.to_csv(os.path.join(outDir, baseName + "_" + str(start) + ".csv"), encoding="utf-8")
            if len(rows) < chunk_size:
                break
            start += chunk_size

總結
以上是生活随笔為你收集整理的python 拆分(几G)的tsv文件为较小的csv文件的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 关于图像金字塔
- 下一篇: 计算摄影——风格迁移