
- Install Python 3.8 or newer.
- Install the required packages:
pip install pdfminer.six pandas xlwings dateparser
# -*- coding: utf-8 -*-
# Date : 2022/4/29
# Author: Xiaosan
# Email : ainoone@outlook.com
import json
import re
from collections import defaultdict, UserDict, UserList
from dataclasses import dataclass, field
from pathlib import Path
from typing import Union, Any
import pandas as pd
import xlwings as xw
from dateparser import parse
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox
# Anything accepted as a file location: a Path object or a plain string.
FileOrName = Union[Path, str]
# The current user's Desktop folder.
Desktop = Path.home().joinpath('Desktop')
def desktop_fd():
    """Return the ``Result`` folder on the desktop, creating it if needed.

    Returns:
        Path: ``Desktop / 'Result'``; the directory exists when the call
        returns.
    """
    dst_dir_fd = Desktop / 'Result'
    # parents=True: if the Desktop folder itself does not exist, a plain
    # mkdir() would raise FileNotFoundError.
    dst_dir_fd.mkdir(parents=True, exist_ok=True)
    return dst_dir_fd
def auto_excel(fun):
    """Decorator: mirror the JSON file produced by ``fun`` into an Excel workbook.

    ``fun`` must return the path of a JSON file. After every call that file is
    read into a DataFrame, transposed, and written through a hidden Excel
    instance to a sibling file with the ``.xlsx`` suffix; the header row is
    then styled.

    Args:
        fun: Callable returning the path (str or Path) of a JSON file.

    Returns:
        A wrapper that accepts the same arguments as ``fun`` and returns
        ``fun``'s result unchanged.
    """
    from functools import wraps  # local import keeps this block self-contained

    @wraps(fun)  # keep the wrapped function's __name__ / __doc__
    def wrapper(*args, **kwargs):
        fullname = fun(*args, **kwargs)
        output = Path(fullname).with_suffix('.xlsx')
        # Transpose so that each top-level JSON key becomes one row.
        df = pd.read_json(fullname).convert_dtypes().T.fillna('')
        with xw.App(visible=False, add_book=True) as app:
            app.display_alerts = False
            app.screen_updating = False
            wb = app.books.active
            sht = wb.sheets.active
            # Switch these columns to text format ("@") before the data is
            # written - presumably so long digit strings (codes, taxpayer
            # IDs) are kept verbatim instead of being turned into numbers.
            for column in ('C:C', 'H:H', 'I:I'):
                sht.range(column).api.NumberFormat = '@'
            sht.range('A1').value = df
            sht.autofit()  # fit row heights and column widths to the content
            header = sht.range('A1').expand('right')  # header row
            header.color = (128, 128, 128)  # fill colour
            header.row_height = 15
            header.font.color = (255, 255, 255)
            header.font.bold = True
            header.api.HorizontalAlignment = -4108  # xlHAlignCenter
            # xlVAlignCenter; the previous value -4130 is xlVAlignJustify,
            # which does not centre the text vertically.
            header.api.VerticalAlignment = -4108
            wb.save(output)
            wb.close()
        return fullname

    return wrapper
@dataclass
class FileDictLists(UserDict):
    """Mapping of directory -> list of PDF files found beneath a root folder.

    The mapping is filled once, at construction time, by a recursive search
    for ``*.pdf`` below ``self.path``.

    Args:
        dir_path: Root to scan. ``None`` scans the folder containing this
            script. An existing directory is scanned itself; any other path
            falls back to its parent folder.
        _dst: Folder that receives ``File list.json``; defaults to the
            folder returned by ``desktop_fd()``.
    """
    dir_path: Union[Path, str, None] = None
    _dst: Path = field(default_factory=desktop_fd)

    def __post_init__(self):
        super().__init__()
        if self.dir_path is None:
            self.path = Path(__file__).parent.resolve()
        else:
            root = Path(self.dir_path)
            # Previously the parent was always taken, so passing a directory
            # scanned one level too high. Non-directories keep the old rule.
            self.path = (root if root.is_dir() else root.parent).resolve()
        grouped = defaultdict(list)
        for file in self.path.rglob('*.pdf'):
            grouped[file.parent].append(file)
        self.data.update(grouped)

    @property
    def dirs(self):
        """Iterator over the directories that hold at least one PDF."""
        return iter(self.data)

    @property
    def filenames(self):
        """Generator of POSIX-style path strings for every collected PDF.

        Uses the lists gathered at construction time rather than globbing
        each directory a second time.
        """
        return (file.as_posix() for files in self.data.values() for file in files)

    @property
    def relative_path_filenames(self):
        """List of collected PDF paths rewritten as ``./<path below root>``."""
        # relative_to() strips the root prefix only; the former string
        # replace rewrote every occurrence of the root text inside a path
        # and went through a pandas Series just to do so.
        return ['./' + Path(name).relative_to(self.path).as_posix()
                for name in self.filenames]

    def dump_json(self):
        """Write ``File list.json`` into ``_dst``.

        The file holds one JSON object whose keys are the relative PDF paths
        and whose values are all ``null``.
        """
        data = dict.fromkeys(self.relative_path_filenames)
        fp_json = self._dst / 'File list.json'
        with open(fp_json, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False, separators=(',', ':'))
class RegEx:
    """Compiled patterns used to recognise invoice fields in extracted text.

    Expected shapes, as noted by the original author:
      * InvoiceNumber - 8 digits. The trailing ``$`` is required; without it
        the first 8 digits of an invoice code would match as well.
      * InvoiceCode   - 12 digits ending in 0 with no machine number
        (blockchain e-invoice), or 12 digits starting with 0 and ending in
        11 (ordinary e-invoice).
      * TaxPayerNum   - starts with 91 or 92, 18 characters in total, digits
        only or digits mixed with upper-case letters.
    """

    # Invoice number
    InvoiceNumber = re.compile(r'\d{8}$')

    # Invoice code
    InvoiceCode = re.compile(r'^[01]\d{9}(11|\d0)$')

    # Billing date: "YYYY年M月D日" form, or a bare digit run starting with 20
    BillingDate_0 = re.compile(r'(20)\d{2}年\d{1,2}月\d{1,2}日')
    BillingDate_1 = re.compile(r'^(20)\d{2}\d*$')

    # Tax rate: a listed percentage, tax-exempt, not-taxed, or "***"
    TaxRate = re.compile(r'13%|9%|6%|5%|3%|1%|0%|免税|不征税|\*{3}$')

    # Total including tax: digits followed by exactly two decimals.
    # (An earlier variant keyed on the yen sign was disabled by the author.)
    TotalTaxSum = re.compile(r'(\d+)\.\d{2}')

    # Buyer / seller taxpayer identification number.
    # Invoices issued by the tax bureau on behalf of a seller are not covered.
    TaxPayerNum = re.compile(r'^(91|92)[A-Z\d]{16}', flags=re.ASCII)
class ContentList(UserList):
    """List of text fragments from one invoice, with field look-ups.

    Iterating the list yields every fragment with all whitespace removed;
    the ``_get_*`` helpers scan those cleaned fragments in order using the
    patterns defined on ``RegEx``.
    """
    regex = RegEx()
    _whitespace = re.compile(r'\s+')  # compiled once, used on every iteration

    def __iter__(self):
        for content in self.data:
            yield self._whitespace.sub('', content)

    def _first_match(self, pattern):
        """Return the matched text of the first fragment ``pattern`` matches
        (anchored at the fragment start), or ``None``."""
        for text in self:
            if found := pattern.match(text):
                return found.group()
        return None

    def _get_invoice_code(self):
        """Invoice code, or ``None`` when no fragment matches."""
        return self._first_match(self.regex.InvoiceCode)

    def _get_invoice_number(self):
        """Invoice number, or ``None`` when no fragment matches."""
        return self._first_match(self.regex.InvoiceNumber)

    def _get_bill_date(self):
        """Billing date formatted as ``YYYY-MM-DD``, or ``None``.

        Fragments that look like a date but cannot be parsed are skipped;
        previously the ``None`` returned by ``parse`` caused an
        AttributeError on ``.strftime``.
        """
        for text in self:
            found = (self.regex.BillingDate_0.match(text)
                     or self.regex.BillingDate_1.match(text))
            if not found:
                continue
            # Language: Chinese; time zone: China.
            parsed = parse(found.group(), date_formats=['%Y%m%d'], languages=['zh'],
                           settings={'TIMEZONE': 'Asia/Shanghai'})
            if parsed is not None:
                return parsed.strftime('%Y-%m-%d')
        return None

    def _get_tax_rate(self):
        """Tax rate text, or ``None`` when no fragment matches."""
        return self._first_match(self.regex.TaxRate)

    def _get_tts(self):
        """Total Tax Sum: the amount found in the last fragment containing one.

        Returns ``None`` when no fragment contains an amount.
        """
        amounts = [found.group() for text in self
                   if (found := self.regex.TotalTaxSum.search(text))]
        return amounts[-1] if amounts else None

    def _get_tpn(self):
        """Tax Payer Num: buyer and seller taxpayer identification numbers.

        Returns:
            dict with keys ``taxpayerNumber`` (buyer) and ``salesTaxpayerNum``
            (seller); a value is ``None`` when it was not found.
        """
        found = []
        for text in self:
            if hit := self.regex.TaxPayerNum.match(text):
                found.append(hit.group())
                if len(found) == 2:  # only the first two numbers are used
                    break
        retval = dict.fromkeys(['taxpayerNumber', 'salesTaxpayerNum'])
        if found:
            # With a single number the buyer is taken to have left the field
            # blank, so that number is recorded as the seller's.
            retval['salesTaxpayerNum'] = found[-1]
        if len(found) == 2:
            retval['taxpayerNumber'] = found[0]
        return retval

    def dump_dict(self):
        """Collect all recognised fields into one dict.

        Returns:
            ``{'anchor': False}`` when the list is empty; otherwise a dict
            with ``anchor=True`` and one entry per invoice field (``None``
            for fields that were not found).
        """
        if not self.data:
            return dict(anchor=False)
        taxpayer_num = self._get_tpn()
        return dict(
            anchor=True,  # marks that text was available for this file
            InvoiceCode=self._get_invoice_code(),
            InvoiceNumber=self._get_invoice_number(),
            BillTime=self._get_bill_date(),
            TaxRate=self._get_tax_rate(),
            TotalTaxSum=self._get_tts(),
            TaxPayerNum=taxpayer_num.get('taxpayerNumber'),
            SalesTaxPayerNum=taxpayer_num.get('salesTaxpayerNum'),
        )
@dataclass
class EInvoice:
    """Electronic invoice (e-invoice) backed by a single PDF file.

    Attributes are documented inline; ``data`` starts as an empty
    ``ContentList`` and is filled by ``_extract_content``.
    """
    pdf_file: FileOrName  # path of the PDF to read
    data: Any = field(default_factory=ContentList)  # extracted text fragments

    def _extract_content(self):
        """Append the stripped text lines of the first PDF page to ``data``.

        Only lines inside text boxes are considered, and only those whose
        ``len()`` is greater than 2. A message is printed when nothing was
        extracted.
        """
        for page_layout in extract_pages(pdf_file=self.pdf_file, page_numbers=[0]):
            text_boxes = (item for item in page_layout if isinstance(item, LTTextBox))
            for box in text_boxes:
                for line in box:
                    if len(line) > 2:
                        # strip leading and trailing whitespace
                        self.data.append(line.get_text().strip())
            if not self.data:
                print(f'{self.pdf_file} 文件无法解析。')
                break

    def converter_dict(self):
        """Extract the page text and return ``{<pdf_file as str>: <field dict>}``."""
        self._extract_content()
        return {f'{self.pdf_file}': self.data.dump_dict()}
def load() -> dict:
    """Return the parsed content of ``Result/File list.json`` on the desktop.

    When the file does not exist yet it is first generated by
    ``FileDictLists().dump_json()``.
    """
    list_file = Desktop / 'Result/File list.json'  # target file path
    if not list_file.exists():
        FileDictLists().dump_json()
    return json.loads(list_file.read_bytes())
@auto_excel
def dump2json(obj: dict = None) -> str:
    """Parse every PDF named in ``obj`` and write the results to ``data.json``.

    Args:
        obj: Mapping whose keys are PDF paths. Each key's value is replaced,
            in place, by the field dict extracted from that PDF. ``None`` is
            treated as an empty mapping.

    Returns:
        POSIX-style path string of the written ``Result/data.json`` file.
    """
    if obj is None:  # the declared default used to crash on .keys()
        obj = {}
    # Collect into a separate dict first so the input is never modified
    # while it is being iterated.
    parsed = {}
    for file in obj:
        parsed.update(EInvoice(file).converter_dict())
    obj.update(parsed)
    fp_json = Desktop / 'Result/data.json'
    # Make sure the output folder exists when this is called on its own.
    fp_json.parent.mkdir(parents=True, exist_ok=True)
    with open(fp_json, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=4, ensure_ascii=False, separators=(',', ':'))
    return fp_json.as_posix()  # path string with forward slashes
if __name__ == '__main__':
    # Script entry point: load (or build) the PDF list, then parse each
    # invoice and export the results.
    engine = dump2json(load())
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)