使用另一个 bytestring 拆分 lazy bytestring

时间:2017-06-28 20:50:51

标签: haskell lazy-evaluation bytestring

如何以另一个字节串(例如 "\r\n")作为分隔符来拆分 lazy 字节串?我正在寻找具有以下类型的函数:

BSL.ByteString -> BSL.ByteString -> [BSL.ByteString]

我知道有 breakSubstring,但该函数仅适用于严格(strict)字节串。我也看到了这个问题(question),但其解决方案同样是使用严格字节串。

1 个答案:

答案 0 :(得分:1)

回答我自己的问题:我创建了一个pull request,将breakSubstring添加到Data.ByteString.Lazy(改编自严格版本)。

在合并拉取请求之前,可以使用以下代码:

{-# LANGUAGE BangPatterns #-}

module Lib (breakSubstring) where

import Data.Bits (finiteBitSize, shiftL, (.|.), (.&.))
import Data.Word (Word32)
import Prelude

import qualified Data.ByteString.Lazy as BSL


-- | Break a lazy 'BSL.ByteString' at the first occurrence of a pattern.
--
-- @breakSubstring pat src@ returns @(before, rest)@, where @rest@ starts with
-- the first occurrence of @pat@ inside @src@ and @before@ is everything in
-- front of it.  When @pat@ does not occur the result is @(src, BSL.empty)@;
-- an empty @pat@ matches immediately and yields @(BSL.empty, src)@.
--
-- The input is scanned front to back one byte at a time and scanning stops at
-- the first match, so the total length of @src@ is never computed.  This
-- keeps the function usable on streamed or infinite inputs as long as the
-- pattern eventually occurs.
--
-- Partially applying the function to @pat@ selects the search strategy once:
--
-- * a one-byte pattern delegates to 'BSL.break';
-- * a pattern that fits into a machine 'Word' is matched by comparing a
--   sliding window packed into a 'Word' (@shift@);
-- * longer patterns use a Karp-Rabin rolling hash (@karpRabin@).
breakSubstring
  :: BSL.ByteString                    -- ^ pattern to search for
  -> BSL.ByteString                    -- ^ string to search in
  -> (BSL.ByteString, BSL.ByteString)  -- ^ (prefix before match, match onwards)
breakSubstring pat =
  case BSL.uncons pat of
    Nothing -> \src -> (BSL.empty, src)
    Just (b, more)
      | BSL.null more -> BSL.break (== b)
      | lp * 8 <= fromIntegral (finiteBitSize (0 :: Word)) -> shift
      | otherwise -> karpRabin
  where
    lp = BSL.length pat

    -- Rolling-hash search for patterns that do not fit into a 'Word'.
    karpRabin :: BSL.ByteString -> (BSL.ByteString, BSL.ByteString)
    karpRabin src
        -- Only the first lp bytes are inspected to detect a too-short input.
        | BSL.length window < lp = (src, BSL.empty)
        | otherwise              = search (rollingHash window) 0 src ahead0
      where
        (window, ahead0) = BSL.splitAt lp src
        k           = 2891336453 :: Word32
        rollingHash = BSL.foldl' (\h b -> h * k + fromIntegral b) 0
        hp          = rollingHash pat
        m           = k ^ lp
        -- Invariants: @here@ is @src@ with @off@ bytes dropped, @ahead@ is
        -- @here@ with a further @lp@ bytes dropped, and @hs@ is the hash of
        -- the first @lp@ bytes of @here@.
        search !hs !off here ahead
            -- A hash hit is confirmed bytewise to rule out collisions.
            | hp == hs && pat == BSL.take lp here = (BSL.take off src, here)
            | otherwise =
                case (BSL.uncons here, BSL.uncons ahead) of
                  (Just (old, here'), Just (new, ahead')) ->
                    -- Slide the window: add the incoming byte, remove the
                    -- contribution of the outgoing one.
                    search (hs * k + fromIntegral new - m * fromIntegral old)
                           (off + 1) here' ahead'
                  -- No byte left to extend the window: pattern not found.
                  _ -> (src, BSL.empty)
    {-# INLINE karpRabin #-}

    -- Search for short patterns: the current window is packed into a 'Word'
    -- and compared against the packed pattern directly.
    shift :: BSL.ByteString -> (BSL.ByteString, BSL.ByteString)
    shift src
        | BSL.length window < lp = (src, BSL.empty)
        | otherwise              = search (intoWord window) 0 ahead0
      where
        (window, ahead0) = BSL.splitAt lp src
        intoWord :: BSL.ByteString -> Word
        intoWord = BSL.foldl' (\w b -> (w `shiftL` 8) .|. fromIntegral b) 0
        wp   = intoWord pat
        -- Keeps only the low 8 * lp bits, i.e. the bytes inside the window.
        mask = (1 `shiftL` fromIntegral (8 * lp)) - 1
        -- Invariant: @w@ packs the @lp@ bytes of @src@ starting at offset
        -- @off@, and @ahead@ is @src@ with @off + lp@ bytes dropped.
        search !w !off ahead
            | w == wp   = BSL.splitAt off src
            | otherwise =
                case BSL.uncons ahead of
                  Nothing -> (src, BSL.empty)
                  Just (new, ahead') ->
                    search (mask .&. ((w `shiftL` 8) .|. fromIntegral new))
                           (off + 1) ahead'
    {-# INLINE shift #-}