Skip to content

collections — 容器数据类型

collections 是标准库中使用频率最高的模块之一,提供了比内置类型更强大的容器。

defaultdict — 带默认值的字典

python
from collections import defaultdict

# Traditional approach (verbose): every new key must be initialized by hand
word_count = {}
for word in "apple banana apple cherry banana apple".split():
    if word not in word_count:
        word_count[word] = 0
    word_count[word] += 1

# defaultdict approach (concise): a missing key is created as int(), i.e. 0
word_count = defaultdict(int)
for word in "apple banana apple cherry banana apple".split():
    word_count[word] += 1

print(dict(word_count))  # {'apple': 3, 'banana': 2, 'cherry': 1}

# Grouping: a missing key is created as an empty list
from collections import defaultdict

groups = defaultdict(list)
data = [("fruit", "apple"), ("veggie", "carrot"), ("fruit", "banana")]
for category, item in data:
    groups[category].append(item)

print(dict(groups))  # {'fruit': ['apple', 'banana'], 'veggie': ['carrot']}

# Nested defaultdict: each missing outer key gets its own defaultdict(int)
nested = defaultdict(lambda: defaultdict(int))
nested["user1"]["login"] += 1
nested["user1"]["purchase"] += 3

Counter — 计数器

python
from collections import Counter

# Construction: from a string, a list, or a mapping of counts
c1 = Counter("abracadabra")
c2 = Counter(["apple", "banana", "apple", "cherry"])
c3 = Counter({"a": 3, "b": 2})

print(c1)  # Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})

# The N most common elements
print(c1.most_common(3))  # [('a', 5), ('b', 2), ('r', 2)]

# Arithmetic
c4 = Counter("aab")
c5 = Counter("abc")
print(c4 + c5)  # Counter({'a': 3, 'b': 2, 'c': 1})
print(c4 - c5)  # Counter({'a': 1})
print(c4 & c5)  # Counter({'a': 1, 'b': 1}) — intersection (minimum of each count)
print(c4 | c5)  # Counter({'a': 2, 'b': 1, 'c': 1}) — union (maximum of each count)

# In practice: word-frequency count
import re
text = "Python is great. Python is easy. Python is powerful."
words = re.findall(r'\w+', text.lower())
freq = Counter(words)
print(freq.most_common(5))
# [('python', 3), ('is', 3), ('great', 1), ('easy', 1), ('powerful', 1)]

deque — 双端队列

python
from collections import deque

# Construction (an optional maxlen bounds the size)
dq = deque([1, 2, 3, 4, 5], maxlen=5)

# O(1) operations at both ends (list.insert(0, x) is O(n))
dq.appendleft(0)   # add on the left; the full deque drops 5 from the right
dq.append(6)       # add on the right; the full deque drops 0 from the left
print(dq)          # deque([1, 2, 3, 4, 6], maxlen=5)

dq.popleft()       # O(1)
dq.pop()           # O(1)

# Rotation
dq = deque([1, 2, 3, 4, 5])
dq.rotate(2)       # rotate right by 2 steps
print(dq)          # deque([4, 5, 1, 2, 3])
dq.rotate(-2)      # rotate left by 2 steps
print(dq)          # deque([1, 2, 3, 4, 5])

# 实战:滑动窗口
def moving_average(data, window_size):
    """Yield the mean of every full sliding window over *data*.

    A bounded deque keeps only the latest ``window_size`` values, so the
    oldest value falls out automatically as each new one arrives. Nothing
    is yielded until the window has filled up for the first time.
    """
    recent = deque(maxlen=window_size)
    for item in data:
        recent.append(item)
        if len(recent) != window_size:
            # Window not full yet: no average to report.
            continue
        yield sum(recent) / window_size

data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print(list(moving_average(data, 3)))
# [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

namedtuple — 命名元组

python
from collections import namedtuple

# Create namedtuple classes
Point = namedtuple('Point', ['x', 'y'])
# defaults are applied to the rightmost fields, so only alpha defaults to 255
Color = namedtuple('Color', 'r g b alpha', defaults=[255])

p = Point(3, 4)
print(p.x, p.y)    # 3 4
print(p[0], p[1])  # 3 4 (indexing still works)
print(p._asdict()) # {'x': 3, 'y': 4}

# Immutable
# p.x = 5  # AttributeError

# Replace field values (returns a new object)
p2 = p._replace(x=10)
print(p2)  # Point(x=10, y=4)

# 实战:CSV 数据处理
import csv
# defaultdict is used for the grouping step below; import it here so the
# snippet runs on its own instead of raising NameError.
from collections import defaultdict, namedtuple

# Lightweight record type for one employee row
Employee = namedtuple('Employee', ['name', 'dept', 'salary'])

employees = [
    Employee('Alice', 'Engineering', 90000),
    Employee('Bob', 'Marketing', 75000),
    Employee('Charlie', 'Engineering', 85000),
]

# Group employees by department
by_dept = defaultdict(list)
for emp in employees:
    by_dept[emp.dept].append(emp)

# Compute the average salary of each department
for dept, emps in by_dept.items():
    avg = sum(e.salary for e in emps) / len(emps)
    print(f"{dept}: 平均薪资 {avg:,.0f}")

OrderedDict — 有序字典

Python 3.7+ 内置 dict 已保证插入顺序,但 OrderedDict 有额外功能:

python
from collections import OrderedDict

od = OrderedDict()
od['a'] = 1
od['b'] = 2
od['c'] = 3

# Move a key to the end / to the beginning
od.move_to_end('a')        # move to the end
od.move_to_end('c', last=False)  # move to the beginning

# Equality is order-sensitive (plain dict ignores order)
d1 = OrderedDict([('a', 1), ('b', 2)])
d2 = OrderedDict([('b', 2), ('a', 1)])
print(d1 == d2)  # False (different order)

# LRU 缓存实现
class LRUCache:
    """Fixed-capacity cache that evicts the least recently used entry.

    Entries are kept in an OrderedDict in recency order: the first item is
    the least recently used one, the last item the most recently used one.
    """

    def __init__(self, capacity: int):
        self.capacity = capacity
        self.cache = OrderedDict()

    def get(self, key: int) -> int:
        """Return the value stored for *key*, or -1 when it is absent.

        A successful lookup marks the entry as most recently used.
        """
        try:
            self.cache.move_to_end(key)
        except KeyError:
            return -1
        return self.cache[key]

    def put(self, key: int, value: int) -> None:
        """Store *value* under *key* and mark it as most recently used.

        When the cache grows past its capacity, the least recently used
        entry is dropped.
        """
        entries = self.cache
        entries[key] = value
        entries.move_to_end(key)
        if len(entries) > self.capacity:
            entries.popitem(last=False)  # drop the oldest entry

ChainMap — 链式映射

python
from collections import ChainMap

# Logical merge of several dicts (no data is copied)
defaults = {'color': 'red', 'user': 'guest', 'timeout': 30}
env_vars = {'user': 'admin', 'timeout': 60}
cli_args = {'color': 'blue'}

# Precedence: cli_args > env_vars > defaults
config = ChainMap(cli_args, env_vars, defaults)
print(config['color'])    # blue (from cli_args)
print(config['user'])     # admin (from env_vars)
print(config['timeout'])  # 60 (from env_vars)

# Writes only affect the first mapping
config['new_key'] = 'value'
print(cli_args)  # {'color': 'blue', 'new_key': 'value'}

UserDict / UserList / UserString

用于创建自定义容器类(比直接继承内置类型更安全):

python
from collections import UserDict

class CaseInsensitiveDict(UserDict):
    """Dictionary whose string keys are matched case-insensitively.

    Keys are lower-cased on every write, lookup, membership test and
    deletion, so ``d['Name']`` and ``d['NAME']`` address the same entry.
    Keys are expected to be strings (they must provide ``.lower()``).
    """

    def __setitem__(self, key, value):
        super().__setitem__(key.lower(), value)

    def __getitem__(self, key):
        return super().__getitem__(key.lower())

    def __delitem__(self, key):
        # Entries are stored under the lower-cased key, so deletion has to
        # normalize too; otherwise ``del d['Name']`` raises KeyError for an
        # entry that exists. This also makes the inherited ``pop`` work.
        super().__delitem__(key.lower())

    def __contains__(self, key):
        return super().__contains__(key.lower())

d = CaseInsensitiveDict()
d['Name'] = 'Alice'
print(d['name'])   # Alice
print(d['NAME'])   # Alice
print('name' in d) # True

选择指南

  • 需要计数 → Counter
  • 需要默认值 → defaultdict
  • 需要双端操作或固定大小缓冲区 → deque
  • 需要轻量级数据对象 → namedtuple 或 dataclass
  • 需要合并多个配置层 → ChainMap

本站内容由 褚成志 整理编写,仅供学习参考