importduckdb# 直接查询CSV文件,不需要先导入!result=duckdb.sql(""" SELECT city, COUNT(*) AS user_count, AVG(age) AS avg_age FROM 'users.csv' -- 直接写文件路径当表名 GROUP BY city ORDER BY user_count DESC""")print(result)# 查询Parquet文件duckdb.sql("SELECT * FROM 'data/*.parquet'")# 支持通配符,读取所有parquet文件# 查询JSON文件duckdb.sql("SELECT * FROM read_json_auto('events.json')")# 查询Excel文件duckdb.sql(""" INSTALL spatial; -- 安装扩展 LOAD spatial; SELECT * FROM st_read('data.xlsx')""")# 查询远程文件(HTTP/S3)duckdb.sql(""" SELECT * FROM 'https://example.com/data.csv' LIMIT 10""")# 查询S3上的Parquetduckdb.sql(""" SELECT * FROM read_parquet('s3://bucket/path/data.parquet')""")
importduckdbimportpandasaspdimportpolarsaspl# pandas DataFrame直接当SQL表用pandas_df=pd.DataFrame({"name":["Alice","Bob","Charlie"],"score":[95,87,92],})# 直接在SQL中引用pandas变量名!result=duckdb.sql(""" SELECT name, score FROM pandas_df -- 直接用Python变量名 WHERE score > 90 ORDER BY score DESC""")# 结果转pandaspandas_result=result.df()# 转pandas DataFrame# 结果转Polarspolars_result=result.pl()# 转Polars DataFrame# Polars DataFrame也能直接查询polars_df=pl.DataFrame({"x":[1,2,3],"y":[4,5,6]})duckdb.sql("SELECT * FROM polars_df WHERE x > 1")
duckdb.sql(""" SELECT name, department, salary, -- 部门内薪资排名 RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank, -- 部门平均薪资 AVG(salary) OVER (PARTITION BY department) AS dept_avg, -- 与上一个人的薪资差 salary - LAG(salary) OVER (ORDER BY salary) AS salary_diff FROM 'employees.csv'""")
# 行转列duckdb.sql(""" PIVOT (SELECT city, quarter, sales FROM sales_data) ON quarter USING SUM(sales) -- 每个季度的销售额变成单独的列""")# 列转行duckdb.sql(""" UNPIVOT sales_wide ON q1, q2, q3, q4 -- 这些列转成行 INTO NAME quarter VALUE sales -- 新列名""")