如何使用Python读取腾讯在线文档

最编程 2024-01-02 19:19:35

...

需求背景：

读取腾讯文档在线表格

腾讯文档示例

实现思路：

利用 Python + requests 的接口自动化思路：先将在线文档导出至本地，然后用 pandas 的 read_excel 方法进行读取；过程中采用 Fiddler 进行抓包分析。

详细步骤：

1、创建导出任务：提取返回的操作id

导出

image

2、查询导出任务：轮询至进度为100，提取返回的file_url

查询导出进度接口示例

3、get方法请求file_url下载文件

下载文件接口示例

4、写入本地文件


file_name = "test.xlsx"

# Drop any previous copy so the new download starts from a clean file.
if os.path.exists(file_name):
    os.remove(file_name)

# Persist the raw bytes of the download response to disk.
with open(file_name, "wb") as code:
    code.write(content.content)

print("下载完成。")

5、读取本地文件


# Load the sheet named 'xxx', treating the second row as the header and
# keeping only the listed column positions.
io = r'C:\Users\user\PycharmProjects\xxx\test.xlsx'
data = pd.read_excel(
    io,
    sheet_name='xxx',
    header=1,
    usecols=[0, 1, 2, 5, 6, 7, 8],
)

核心问题：

各个接口都需要登录态，共需要 5 个 cookie 键值对。一般同一个企业微信账号下，只有 uid_key、wedrive_ticket 会过期。

（注意：若该账号再次登录访问该腾讯文档会刷新cookie，到时候只需要替换新的uid_key、wedrive_ticket即可。）

uid_key=xx

uid=xxx

wedrive_sid=xxxxxx

wedrive_skey=xxxxxxx

wedrive_ticket=xxxxxx

登录态示例

实战代码：-小白代码请轻喷


# -*- coding: utf-8 -*-

"""

__author__:  @xxx

__datetime__:  xxx

"""

import json
import os
from time import sleep

import click
import pandas as pd
import requests

# Configuration for creating the export task.

# Endpoint that starts an export of an online spreadsheet.
EXPORT_OFFICE_URL = "https://doc.weixin.qq.com/v1/export/export_office"

# Endpoint that reports the progress of a running export task.
QUERY_PROGRESS_URL = "https://doc.weixin.qq.com/v1/export/query_progress"

# Address and display name of the online document.
project_excel_url = "https://doc.weixin.qq.com/sheet/xxx"
project_excel_name = "xxx表"

class ProjectRemindRobot:
    """Export an online spreadsheet to a local file and load it with pandas.

    The workflow is: create an export task, poll its progress until it
    reaches 100, download the resulting file to ``file_name`` and read it
    back as a DataFrame.

    Every request is authenticated with a ``Cookie`` header built from the
    five credential values passed to the constructor. The header is built
    per instance, so the methods can be called in any order.
    """

    # Seconds to wait for a single HTTP request before giving up.
    REQUEST_TIMEOUT = 30
    # Seconds to sleep between two progress polls.
    POLL_INTERVAL = 1

    def __init__(self,
                 uid: str,
                 uid_key: str,
                 doc_id: str,
                 file_name: str,
                 wedrive_ticket: str,
                 wedrive_skey: str,
                 wedrive_sid: str
                 ):
        """Store the document id, the output path and the login cookies.

        Args:
            uid: Value of the ``uid`` cookie.
            uid_key: Value of the ``uid_key`` cookie.
            doc_id: Id of the document to export (sent as ``docId``).
            file_name: Local path the exported workbook is written to and
                later read from.
            wedrive_ticket: Value of the ``wedrive_ticket`` cookie.
            wedrive_skey: Value of the ``wedrive_skey`` cookie.
            wedrive_sid: Value of the ``wedrive_sid`` cookie.
        """
        self.uid = uid
        self.uid_key = uid_key
        self.doc_id = doc_id
        self.file_name = file_name
        self.wedrive_ticket = wedrive_ticket
        self.wedrive_skey = wedrive_skey
        self.wedrive_sid = wedrive_sid
        # uid, doc_id, wedrive_skey and wedrive_sid usually stay the same.

    def _cookie_header(self) -> str:
        """Return the ``Cookie`` header value built from the credentials."""
        return 'uid=%s; uid_key=%s; wedrive_ticket=%s; wedrive_skey=%s; wedrive_sid=%s;' % (
            self.uid, self.uid_key, self.wedrive_ticket, self.wedrive_skey, self.wedrive_sid)

    def create_export_office_task(self):
        """Create the export task.

        Returns:
            The ``requests`` response of the POST to ``EXPORT_OFFICE_URL``.
        """
        headers = {
            'content-type': 'application/x-www-form-urlencoded',
            'Cookie': self._cookie_header(),
        }
        # Request body.
        body = {'docId': '%s' % self.doc_id, 'version': '2'}
        response_body = requests.post(
            url=EXPORT_OFFICE_URL,
            headers=headers,
            data=body,
            timeout=self.REQUEST_TIMEOUT,
        )
        print("添加导出任务返回内容为：", response_body.text)
        return response_body

    def query_progress_task(self, operation_id):
        """Query the progress of an export task.

        Args:
            operation_id: The ``operationId`` returned when the task was
                created.

        Returns:
            The ``requests`` response of the GET to ``QUERY_PROGRESS_URL``.
        """
        headers = {
            'content-type': 'application/x-www-form-urlencoded',
            'Cookie': self._cookie_header(),
        }
        # Query parameters.
        body = {'operationId': operation_id}
        response_body = requests.get(
            url=QUERY_PROGRESS_URL,
            headers=headers,
            params=body,
            timeout=self.REQUEST_TIMEOUT,
        )
        print("查询进度内容为：", response_body.text)
        return response_body

    def down_file(self, file_url):
        """Download the exported file.

        Args:
            file_url: The ``file_url`` reported by the finished export task.

        Returns:
            The ``requests`` response; its ``content`` holds the file bytes.
        """
        headers = {'Cookie': self._cookie_header()}
        response_body = requests.get(
            url=file_url,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        print("下载文件返回内容为：", response_body)
        return response_body

    def read_data(self, sheet_name='xxx', header=1, usecols=None):
        """Read the downloaded workbook at ``self.file_name``.

        Args:
            sheet_name: Sheet to read.
            header: Zero-based row used as the header; the default ``1``
                is the second row.
            usecols: Column positions to keep; defaults to
                ``[0, 1, 2, 5, 6, 7, 8]``.

        Returns:
            The result of ``pd.read_excel`` for the selected sheet.
        """
        if usecols is None:
            usecols = [0, 1, 2, 5, 6, 7, 8]
        data = pd.read_excel(
            self.file_name,
            sheet_name=sheet_name,
            header=header,
            usecols=usecols,
        )
        return data

    def main(self):
        """Run the whole export: create, poll, download, save and read.

        Returns:
            The data loaded by ``read_data`` from the downloaded file.

        Raises:
            requests.HTTPError: If any of the HTTP calls returns an error
                status.
        """
        # 1. Create the export task.
        resp_create_export_office = self.create_export_office_task()
        resp_create_export_office.raise_for_status()
        operation_id = resp_create_export_office.json()["operationId"]
        print("创建任务成功：%s" % operation_id)

        # 2. Poll the task until it reports 100.
        while True:
            resp_query_progress = self.query_progress_task(operation_id)
            resp_query_progress.raise_for_status()
            # Parse the payload once and reuse it for progress and URL.
            payload = resp_query_progress.json()
            progress = int(payload["progress"])
            print("|" + "#" * progress + "| " + "%s/100" % progress)
            if progress >= 100:
                file_url = payload["file_url"]
                print("导出任务完成：%s" % file_url)
                break
            sleep(self.POLL_INTERVAL)

        # 3. Download the file.
        print("开始下载文件。。。")
        content = self.down_file(file_url)
        # Fail before writing so an error page is never saved as a workbook.
        content.raise_for_status()

        # 4. Write it to disk; "wb" truncates any previous copy.
        with open(self.file_name, "wb") as out_file:
            out_file.write(content.content)
        print("下载完成。")

        # 5. Read the file that was just written.
        return self.read_data()

if __name__ == '__main__':
    # Only uid_key and wedrive_ticket need to be replaced periodically.
    prr = ProjectRemindRobot(
        uid="xxx",
        doc_id="xx",
        file_name="export_file.xlsx",
        wedrive_skey="xx",
        wedrive_sid="xx",
        uid_key="xxx",
        wedrive_ticket="xxx",
    )
    # Kept inside the guard so importing this module never starts an export.
    prr.main()

上一篇：腾讯文档适用于哪些格式的在线协作？

下一篇： CODING 代码托管

如何使用Python读取腾讯在线文档

需求背景：

实现思路：

详细步骤：

核心问题：

如何使用 python 详细读取 micaps 文件

如何使用Python获取WPS文档内容

如何在ROS中使用Python读取雷达生成的PCD点云文件

如何使用Python实现XML文件与数据库之间的数据读取与转换操作

南邮OJ Web任务大揭秘：层层挑战剖析 1. 挑战一：迷宫般的目录探索题目作者似乎穷举了所有可能的目录组合，最终在404.php中的