# collections — 容器数据类型
collections是标准库中使用频率最高的模块之一,提供了比内置类型更强大的容器。
## defaultdict — 带默认值的字典
python
from collections import defaultdict

# Traditional approach (verbose): every key has to be initialised by hand
word_count = {}
for word in "apple banana apple cherry banana apple".split():
    if word not in word_count:
        word_count[word] = 0
    word_count[word] += 1

# defaultdict approach (concise): a missing key starts at int(), i.e. 0
word_count = defaultdict(int)
for word in "apple banana apple cherry banana apple".split():
    word_count[word] += 1
print(dict(word_count))  # {'apple': 3, 'banana': 2, 'cherry': 1}

# Grouping: a missing key starts as an empty list
from collections import defaultdict
groups = defaultdict(list)
data = [("fruit", "apple"), ("veggie", "carrot"), ("fruit", "banana")]
for category, item in data:
    groups[category].append(item)
print(dict(groups))  # {'fruit': ['apple', 'banana'], 'veggie': ['carrot']}

# Nested defaultdict: the outer factory builds a fresh inner defaultdict(int)
nested = defaultdict(lambda: defaultdict(int))
nested["user1"]["login"] += 1
nested["user1"]["purchase"] += 3

## Counter — 计数器
python
from collections import Counter
# Construction: from a string (counts characters), from a list, or from a mapping of counts
c1 = Counter("abracadabra")
c2 = Counter(["apple", "banana", "apple", "cherry"])
c3 = Counter({"a": 3, "b": 2})
print(c1) # Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})
# The N most common elements
print(c1.most_common(3)) # [('a', 5), ('b', 2), ('r', 2)]
# Arithmetic between counters
c4 = Counter("aab")
c5 = Counter("abc")
print(c4 + c5) # Counter({'a': 3, 'b': 2, 'c': 1})
print(c4 - c5) # Counter({'a': 1}) — zero and negative results are dropped
print(c4 & c5) # Counter({'a': 1, 'b': 1}) — intersection (minimum of each count)
print(c4 | c5) # Counter({'a': 2, 'b': 1, 'c': 1}) — union (maximum of each count)
# In practice: word-frequency count
import re
text = "Python is great. Python is easy. Python is powerful."
words = re.findall(r'\w+', text.lower())
freq = Counter(words)
print(freq.most_common(5))

## deque — 双端队列
python
from collections import deque
# Create a deque (a maximum length can be set)
dq = deque([1, 2, 3, 4, 5], maxlen=5)
# O(1) operations at both ends (list.insert(0, x) is O(n))
dq.appendleft(0) # add on the left; the full deque drops 5 from the right
dq.append(6) # add on the right; the full deque drops 0 from the left
print(dq) # deque([1, 2, 3, 4, 6], maxlen=5)
dq.popleft() # O(1)
dq.pop() # O(1)
# Rotation
dq = deque([1, 2, 3, 4, 5])
dq.rotate(2) # rotate 2 steps to the right
print(dq) # deque([4, 5, 1, 2, 3])
dq.rotate(-2) # rotate 2 steps to the left
print(dq) # deque([1, 2, 3, 4, 5])
# In practice: sliding window
def moving_average(data, window_size):
    """Yield the mean of every full window of ``window_size`` consecutive values.

    Nothing is yielded until ``window_size`` values have been consumed, so an
    input shorter than the window produces no output at all.

    Args:
        data: Iterable of numbers.
        window_size: Number of values per window; must be at least 1.

    Yields:
        ``sum(window) / window_size`` for the most recent ``window_size`` values.

    Raises:
        ValueError: If ``window_size`` is less than 1. Because this is a
            generator, the error surfaces on first iteration, not at call time.
    """
    if window_size < 1:
        # A zero-length window would otherwise fail with ZeroDivisionError.
        raise ValueError("window_size must be at least 1")
    # maxlen makes the deque discard its oldest value automatically on append.
    window = deque(maxlen=window_size)
    for value in data:
        window.append(value)
        if len(window) == window_size:
            yield sum(window) / window_size
# Demo: averages over a window of 3 across the integers 1..10
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print(list(moving_average(data, 3)))
# [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

## namedtuple — 命名元组
python
from collections import namedtuple
# Create named-tuple classes
Point = namedtuple('Point', ['x', 'y'])
Color = namedtuple('Color', 'r g b alpha', defaults=[255]) # defaults apply to the rightmost fields: only alpha defaults to 255
p = Point(3, 4)
print(p.x, p.y) # 3 4
print(p[0], p[1]) # 3 4 (indexing still works)
print(p._asdict()) # {'x': 3, 'y': 4}
# Immutable
# p.x = 5 # AttributeError
# Replace field values (returns a new object)
p2 = p._replace(x=10)
print(p2) # Point(x=10, y=4)
# In practice: processing CSV-style records
import csv
# defaultdict is used below, so it is imported here to keep the snippet self-contained
from collections import defaultdict, namedtuple

Employee = namedtuple('Employee', ['name', 'dept', 'salary'])
employees = [
    Employee('Alice', 'Engineering', 90000),
    Employee('Bob', 'Marketing', 75000),
    Employee('Charlie', 'Engineering', 85000),
]

# Group by department
by_dept = defaultdict(list)
for emp in employees:
    by_dept[emp.dept].append(emp)

# Average salary per department
for dept, emps in by_dept.items():
    avg = sum(e.salary for e in emps) / len(emps)
    print(f"{dept}: 平均薪资 {avg:,.0f}")

## OrderedDict — 有序字典
Python 3.7+ 内置 dict 已保证插入顺序,但 OrderedDict 有额外功能:
python
from collections import OrderedDict
od = OrderedDict()
od['a'] = 1
od['b'] = 2
od['c'] = 3
# Move a key to the end / to the beginning
od.move_to_end('a') # move to the end
od.move_to_end('c', last=False) # move to the beginning
# Equality between OrderedDicts takes order into account (plain dicts ignore it)
d1 = OrderedDict([('a', 1), ('b', 2)])
d2 = OrderedDict([('b', 2), ('a', 1)])
print(d1 == d2) # False (different order)
# LRU cache implementation
class LRUCache:
    """Fixed-capacity cache that evicts the least recently used entry.

    Entries live in an ``OrderedDict`` ordered from least to most recently
    used: every successful ``get`` and every ``put`` moves the key to the end,
    so the entry at the front is always the eviction candidate.
    """

    def __init__(self, capacity: int):
        """Create an empty cache holding at most ``capacity`` entries."""
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key: int) -> int:
        """Return the value stored for ``key``, or -1 if it is not cached.

        A hit marks ``key`` as the most recently used entry.
        """
        if key not in self.cache:
            return -1
        self.cache.move_to_end(key)
        return self.cache[key]

    def put(self, key: int, value: int) -> None:
        """Insert or overwrite ``key`` and evict the oldest entry if over capacity."""
        if key in self.cache:
            # Overwriting counts as a use: refresh the key's position first.
            self.cache.move_to_end(key)
        self.cache[key] = value
        if len(self.cache) > self.capacity:
            self.cache.popitem(last=False)  # drop the least recently used entry

## ChainMap — 链式映射
python
from collections import ChainMap
# Logically merge several dicts (no data is copied)
defaults = {'color': 'red', 'user': 'guest', 'timeout': 30}
env_vars = {'user': 'admin', 'timeout': 60}
cli_args = {'color': 'blue'}
# Precedence: cli_args > env_vars > defaults
config = ChainMap(cli_args, env_vars, defaults)
print(config['color']) # blue (from cli_args)
print(config['user']) # admin (from env_vars)
print(config['timeout']) # 60 (from env_vars)
# Writes only affect the first mapping
config['new_key'] = 'value'
print(cli_args) # {'color': 'blue', 'new_key': 'value'}

## UserDict / UserList / UserString
用于创建自定义容器类(比直接继承内置类型更安全):
python
from collections import UserDict
class CaseInsensitiveDict(UserDict):
    """Dictionary whose string keys are case-insensitive.

    Keys are lower-cased on the way in and on every lookup, membership test
    and deletion, so ``d['Name']``, ``d['name']`` and ``d['NAME']`` all refer
    to the same entry. Keys must provide ``.lower()`` (i.e. be strings).
    """

    def __setitem__(self, key, value):
        """Store ``value`` under the lower-cased form of ``key``."""
        super().__setitem__(key.lower(), value)

    def __getitem__(self, key):
        """Return the value for ``key``, ignoring case; raise KeyError if absent."""
        return super().__getitem__(key.lower())

    def __contains__(self, key):
        """Return True if ``key`` is present, ignoring case."""
        return super().__contains__(key.lower())

    def __delitem__(self, key):
        """Delete the entry for ``key``, ignoring case; raise KeyError if absent.

        Without this override ``del d['Name']`` and ``d.pop('Name')`` would look
        up the original-case key in the underlying data and fail.
        """
        super().__delitem__(key.lower())
# Usage: the key is lower-cased when stored and when looked up, so any casing finds it
d = CaseInsensitiveDict()
d['Name'] = 'Alice'
print(d['name']) # Alice
print(d['NAME']) # Alice
print('name' in d) # True

## 选择指南
- 需要计数 → `Counter`
- 需要默认值 → `defaultdict`
- 需要双端操作或固定大小缓冲区 → `deque`
- 需要轻量级数据对象 → `namedtuple` 或 `dataclass`
- 需要合并多个配置层 → `ChainMap`